#!/bin/python
#
#	sitemap generator that parses links
#	ver. 1.4
#	assumptions
#	- path names must be valid utf-8
#
#	author: Kenji Arisawa
#
import sys,os
import re
import time
import stat
import urllib
import os.path

# short alias for os.path.isdir; doit() uses it for the "chkdir" test
isdir = os.path.isdir

def errmsg(s):
	"""Write the message s to standard error, terminated by a newline."""
	line = s + "\n"
	sys.stderr.write(line)

def usage():
	"""Print the command-line synopsis to standard error and terminate.

	Never returns: raises SystemExit.  The exit status is 2 (the
	conventional status for command-line misuse) rather than 0, so that a
	calling script or cron job can tell that nothing was generated.
	"""
	errmsg("usage: sitemapgen [-t type] sitemap.conf")
	errmsg('type is one of "list" or "xml" (default is "xml")')
	sys.exit(2)

def mtime(file): # mtime in ISO8601 time string
	"""Return the modification time of file as 'YYYY-MM-DDTHH:MM:SSZ' (UTC).

	file is a path below the global docroot.  os.stat() raises OSError
	when the file cannot be examined; the error is passed to the caller.
	"""
	#print '#', file
	info = os.stat(docroot + "/" + file)
	utc = time.gmtime(info[stat.ST_MTIME])
	return time.strftime('%Y-%m-%dT%H:%M:%SZ', utc)

def basename(path):
	"""Return the directory prefix of path, including its final "/".

	Despite the name this is the part *before* the file name:
	"bar/foo.html" gives "bar/", "/foo.html" gives "/", and a path
	without any "/" gives "".
	"""
	# rfind() is -1 when there is no "/", which makes the slice end 0
	# and therefore yields the empty string.
	return path[:path.rfind("/") + 1]

def cleanname(path):
	"""Return path with ".", ".." and empty components resolved.

	The path is treated purely as text; the file system is not consulted.
	- A leading "/" is kept, and a trailing "/" is kept as long as at
	  least one component remains.
	- "." and empty components (from "//") are dropped.
	- ".." removes the preceding component.  When there is nothing to
	  remove it is kept for a relative path ("../../a" stays "../../a")
	  and dropped for a rooted path ("/../a" becomes "/a"), because there
	  is nothing above the root.

	Fixes over the previous version: an empty string no longer raises
	IndexError, a path that collapses to nothing no longer gains a
	spurious "/" ("/" used to become "//" and "a/../" used to become the
	absolute "/"), and a ".." no longer cancels an earlier ".." that had
	to be kept.
	"""
	root = path.startswith("/")
	tail = path.endswith("/")
	q = []
	for name in path.split("/"):
		if name == "." or name == "":
			continue
		if name != "..":
			q.append(name)
		elif q and q[-1] != "..":
			# ordinary case: ".." cancels the preceding real component
			q.pop()
		elif not root:
			# relative path climbing above its start: keep the ".."
			q.append(name)
		# else: ".." directly under the root is dropped
	path = "/".join(q)
	if root:
		path = "/" + path
	if tail and q:
		# only re-attach the slash when there is something to attach it to
		path = path + "/"
	return path

def list_output(fn,t):
	"""Print one entry in "list" format: the URL of fn, then lastmod=t.

	The URL is the global urlbase, a "/" and the URL-encoded path fn.
	"""
	url = "%s/%s"%(urlbase,urlencode(fn))
	print("%s lastmod=%s"%(url,t))

def xml_output(fn,t):
	"""Print one <url> element, with <loc> and <lastmod> children, for fn.

	<loc> is the global urlbase, a "/" and the URL-encoded path fn;
	<lastmod> is t as given.
	"""
	print(" <url>")
	loc = "%s/%s"%(urlbase,urlencode(fn))
	print("  <loc>%s</loc>"%loc)
	print("  <lastmod>%s</lastmod>"%t)
	print(" </url>")

def panic(msg):
	"""Report the fatal error msg on standard error and terminate.

	Never returns: raises SystemExit.  The exit status is 1 rather than
	0, so that the failure is visible to whatever invoked the script.
	"""
	errmsg(msg)
	sys.exit(1)

def _tag_link(tag, fn):
	"""Return the raw link target of one tag, or None if it has none.

	tag is the text between "<" and ">"; fn is the file being parsed and
	is used only in the error message.  The tag is tried against each
	pattern of r1 (href, src and data attributes).  The attribute value
	is unquoted, or cut at the first space when it is not quoted, and
	then truncated at a "#" and at a "?" found after its first character.
	The result may be an empty string.  An unterminated quote aborts the
	program through panic().
	"""
	m = None
	for r in r1:
		m = r.match(tag)
		if m:
			break
	if not m:
		return None
	v = m.group(2)
	if v == "":
		# attribute with nothing after "=": there is no target to follow
		return None
	if v[0] == '"' or v[0] == "'":
		n = v.find(v[0], 1)
		if n == -1:
			panic("# quote error: %s: %s"%(fn,tag))
		u = v[1:n]
	else:
		n = v.find(" ", 1)
		if n == -1:
			n = len(v)
		u = v[0:n]
	# Drop the fragment and the query.  The search starts at index 1, so a
	# value that begins with "#" or "?" is returned unchanged.
	for sep in ("#", "?"):
		n = u.find(sep, 1)
		if n != -1:
			u = u[0:n]
	return u

def doit(fn, parent):
	"""Emit the sitemap entry for fn and follow the local links it contains.

	fn is a path of the host from the document root; parent is the file
	that referenced it (None for the start page) and appears only in the
	"not found" message.

	The path is completed with the global default when it is empty, ends
	in "/", or (with chkdir set) names a directory below docroot, and is
	then normalized with cleanname().  Paths matching re_exclude and
	paths already recorded in dic are skipped.  Otherwise the entry is
	written through output() and recorded in dic.  For ".html" files the
	content is scanned and doit() recurses for every link that has no
	scheme or the "http:" scheme and does not start with "#" or "//".

	Changes from the previous version:
	- a missing exclude setting (re_exclude is None) no longer crashes;
	- the chkdir test looks below docroot, not the current directory;
	- empty paths, empty attribute values and empty URLs are skipped
	  instead of raising IndexError;
	- comments and scripts are removed non-greedily and across lines;
	- the page is scanned with finditer() instead of being re-sliced for
	  every tag, and the file is always closed.
	"""
	if fn == "" or fn.endswith("/"):
		fn = fn + default
	elif chkdir and isdir(docroot + "/" + fn): # follow directory without trailing slash?
		fn = fn + "/" + default
	if fn.startswith("/"):
		fn = fn[1:]
	fn = cleanname(fn)
	if re_exclude is not None and re_exclude.match(fn):
		return
	if fn in dic:
		return

	b = basename(fn)
	# basename
	# foo.html	-> ""
	# /foo.html	->	"/"
	# bar/foo.html	-> "bar/"
	# /bar/foo.html	-> "/bar/"
	try:
		lastmod = mtime(fn)
	except OSError:
		errmsg("# %s not found (referenced in %s)"%(fn,parent))
		return
	output(fn, lastmod)
	dic[fn] = None
	if not fn.endswith(".html"):
		return

	try:
		f = open(docroot + "/" + fn)
	except (IOError, OSError):
		# unreadable page: it stays listed but its links are not followed
		return
	try:
		s = f.read()
	finally:
		f.close()
	# Inline flags keep this independent of re.sub() accepting flags=.
	s = re.sub(r"(?s)<!--.*?-->", "", s)			# remove comments
	s = re.sub(r"(?is)<script[\s>].*?</script>", "", s)	# remove javascript
	# r0 consumes the text up to and including the next "<...>", so the
	# successive matches of finditer() are exactly the tags in order.
	for m0 in r0.finditer(s):
		u = _tag_link(m0.group(1), fn)
		if not u:
			continue
		m = re_url.match(u)
		proto = m.group(1)
		if proto != "" and proto != "http:":
			continue
		u = m.group(2)
		# u is
		# //host/path
		# /path
		# path
		# #name
		if u == "" or u[0] == '#' or u[:2] == "//":
			continue
		# the u might be urlencoded
		u = urldecode(u)
		if b == "" or b == "/" or u[0] == "/":
			doit(u, fn)
		else:
			doit(b+u, fn)

def putit(d):
	"""Emit a sitemap entry for every file below the directory d.

	d is a path below the global docroot.  Entries whose name starts
	with "." and paths matching re_exclude are skipped; sub-directories
	are walked recursively.  Each file is written through output() and
	recorded in dic, so that doit() does not list it a second time.

	Changes from the previous version: a missing exclude setting
	(re_exclude is None) no longer crashes, and an entry that cannot be
	examined (a dangling symbolic link, for instance) is reported on
	standard error and skipped instead of aborting the whole run.
	"""
	for f in os.listdir(docroot+"/"+d):
		if f[0] == ".":
			continue
		g = d + "/" + f
		if re_exclude is not None and re_exclude.match(g):
			continue
		try:
			mode = os.stat(docroot+"/"+g)[stat.ST_MODE]
		except OSError:
			errmsg("# %s not found (listed in %s)"%(g,d))
			continue
		if stat.S_ISDIR(mode):
			putit(g)
		else:
			t=mtime(g)
			output(g,t)
			dic[g] = None


# URL quoting helpers (Python 2 urllib)
urlencode=urllib.quote		# encode only the path part
urldecode=urllib.unquote	# decode

# paths already written to the sitemap; used as a set, the values are always None
dic={}
# command-line arguments without the program name
args=sys.argv[1:]

# one tag: skips text up to the next "<"; group 1 is the text between "<" and ">"
r0=re.compile(r'[^<]*<([^>]*)>')

# Link-carrying attributes, matched against the tag text captured by r0.
# The "...0" patterns expect the attribute directly after the tag name, the
# "...1" patterns allow other attributes in between.  Group 2 is everything
# after "attr=".  Tag names are accepted in all-lower or all-upper case only.
r1_href0=re.compile(r'(a|A|area|AREA)\s+href=(.*)')
r1_href1=re.compile(r'(a|A|area|AREA)\s+.*\s+href=(.*)')
r1_src0=re.compile(r'(img|IMG|embed|EMBED|iframe|IFRAME)\s+src=(.*)')
r1_src1=re.compile(r'(img|IMG|embed|EMBED|iframe|IFRAME)\s+.*\s+src=(.*)')
r1_data0=re.compile(r'(object|OBJECT)\s+data=(.*)')
r1_data1=re.compile(r'(object|OBJECT)\s+.*\s+data=(.*)')
# splits a URL: group 1 is the scheme with its ":" (or empty), group 2 the rest
re_url=re.compile(r'([A-Za-z]+:|)(.*)')

# patterns tried in this order by doit(); the first match wins
r1=(r1_href0,r1_href1,r1_src0,r1_src1,r1_data0,r1_data1)

# scheme prefix; removed from urlbase below to obtain hosturl
r2=re.compile(r'(http|https|ftp):')

# output routine; XML unless "-t list" is given on the command line
output = xml_output

# ---- command line ----
if len(args) == 0:
	usage()

if args[0] == "-t":
	# "-t" must be followed by its type and by the configuration file;
	# without this check a short command line raised IndexError.
	if len(args) < 3:
		usage()
	if args[1] == "list":
		output = list_output
	elif args[1] != "xml":
		usage()
	args = args[2:]

conf=args[0]

# ---- configuration ----
# NOTE(security): the configuration file is executed as Python code with
# the privileges of this process; use only configuration files you trust.
# The settings it assigns (names such as urlbase and root, used below)
# become globals of this script.
f=open(conf)
try:
	s=f.read()
finally:
	# closed before exec so that the configuration cannot rebind "f"
	# between the read and the close
	f.close()
exec(s)

# urlbase is "http://ar.aichi-u.ac.jp" for example
hosturl=r2.sub("",urlbase)
# hosturl is "//ar.aichi-u.ac.jp" for example


re_exclude = None

# "exclude" is optional in the configuration
try:
	exclude
except NameError:
	exclude = None

if exclude is not None:
	re_exclude = re.compile(exclude)

if output == xml_output:
	print('<?xml version="1.0" encoding="UTF-8"?>')
	print('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')

# "expose" is optional in the configuration
try:
	expose
except NameError:
	expose = None

if expose is not None:
	# a single directory may be given as a plain string
	if isinstance(expose, str):
		expose = (expose,)
	for e in expose:
		putit(e)

doit(root, None)

if output == xml_output:
	print("</urlset>")
