#!/bin/python # # sitemap generator that parses links # ver. 1.4 # assumptions # - path names must be valid utf-8 # # author: Kenji Arisawa # import sys,os import re import time import stat import urllib import os.path isdir = os.path.isdir def errmsg(s): sys.stderr.write(s+"\n") def usage(): errmsg("usage: sitemapgen [-t type] sitemap.conf") errmsg('type is one of "list" or "xml" (default is "xml")') sys.exit() def mtime(file): # mtime in ISO8601 time string #print '#', file t = os.stat(docroot+"/"+file)[stat.ST_MTIME]; return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t)) def basename(path): n = path.rfind("/") if n == -1: return "" else: return path[:n+1] def cleanname(path): root = False tail = False if path[-1] == "/": tail = True if path[0] == "/": root = True path = path[1:] p = path.split("/") q = [] for name in p: #print q,name if name == "." or name == "": """do nothing""" elif name == ".." and len(q) > 0: q.pop() else: q.append(name) path = "/".join(q) if root: path = "/" + path if tail: path = path + "/" return path def list_output(fn,t): print "%s/%s lastmod=%s"%(urlbase,urlencode(fn),t) def xml_output(fn,t): print " " print " %s/%s"%(urlbase,urlencode(fn)) print " %s"%t print " " def panic(msg): errmsg(msg) sys.exit() def doit(fn, parent): # fn is a path of the host from the document root if fn[-1] == "/": fn = fn + default elif chkdir and isdir(fn): # follow directory without trailing slash? fn = fn + "/" + default if fn[0] == "/": fn = fn[1:] fn = cleanname(fn) if re_exclude.match(fn): return if fn in dic: return b = basename(fn) # basename # foo.html -> "" # /foo.html -> "/" # bar/foo.html -> "bar/" # /bar/foo.html -> "/bar/" try: t=mtime(fn) output(fn,t) except OSError: errmsg("# %s not found (referenced in %s)"%(fn,parent)) return dic[fn] = None if fn[-len(".html"):] != ".html": return try: f=open(docroot+ "/" + fn) except: return s=f.read() f.close() s=re.sub(r"","",s) # remove comments s=re.sub(r"