#!/usr/bin/env python
# encoding: utf-8
"""
feedextractor.py

Created by Jamie Grove on 2008-01-30.

"""

from BeautifulSoup import BeautifulSoup
import urllib2
from xml.dom import minidom
from urlparse import urlparse
import socket

# Default timeout, in seconds, applied to every socket created after this
# call -- this includes the connections urllib2 opens in gethtml().
timeout = 15
socket.setdefaulttimeout(timeout)

# Module-level state filled in by loadsubscriptions():
#   subscribedsites - htmlUrl values from the existing OPML subscriptions
#   subscribedfeeds - xmlUrl values from the existing OPML subscriptions
subscribedsites = []
subscribedfeeds = []

# put your seed url here
# (main() downloads this page and follows the links found on it; links whose
# netloc appears inside this string are ignored by extractlinks())
baseurl = 'http://www.please-change-to-some-site.com/or-full/url.html'

# loadsubscriptions - imports your current opml list
def loadsubscriptions(path='mysubscriptions.opml'):
	"""Load the already-subscribed sites and feeds from an OPML file.

	Every <outline> element found anywhere in the document is inspected.
	Its htmlUrl value is appended to the module-level subscribedsites list
	and its xmlUrl value to the module-level subscribedfeeds list.

	Outlines that lack one of the attributes (folder/category outlines
	usually carry neither) are skipped for that list instead of raising
	KeyError.  As a consequence the two lists are not guaranteed to be
	index-aligned.

	path -- OPML file to read; defaults to 'mysubscriptions.opml', resolved
	relative to the current working directory.

	Errors raised by minidom.parse (missing file, malformed XML) are not
	caught here.
	"""
	global subscribedsites, subscribedfeeds
	dom = minidom.parse(path)
	for node in dom.getElementsByTagName('outline'):
		if node.hasAttribute('htmlUrl'):
			subscribedsites.append(node.getAttribute('htmlUrl'))
		if node.hasAttribute('xmlUrl'):
			subscribedfeeds.append(node.getAttribute('xmlUrl'))

# gethtml(url) - fairly obvious, right?
def gethtml(url):
	"""Download url and return the raw response body.

	url -- absolute URL understood by urllib2.urlopen.

	Returns the body exactly as read from the response (no decoding).
	Errors raised by urllib2.urlopen or by the read (including timeouts
	from the module-wide socket default) propagate to the caller.
	"""
	response = urllib2.urlopen(url)
	try:
		return response.read()
	finally:
		# Close explicitly so the connection is released right away instead
		# of whenever the response object happens to be garbage collected.
		response.close()

# extractlinks(html)  - pulls out all the anchor tags from the html, skips sites you already have
#   uses beautifulsoup to extract links
#   1) checks to see if the netloc of the anchor is in the list of subscribed sites
#   2) checks to see if the netloc of the anchor is in the list of links (keeps out the dupes)
#   3) checks to see if the netloc of the anchor is in the seed url
def extractlinks(html):
	"""Return the hrefs in html that point at sites not seen before.

	html -- markup of the seed page, parsed with BeautifulSoup.

	An href is kept only when its netloc
	  * is not a substring of baseurl,
	  * is not a substring of any entry in subscribedsites, and
	  * is not a substring of any href already collected (so at most one
	    link per site is returned).

	Because the tests are substring tests, an empty netloc (relative
	links, 'mailto:' links, ...) always matches baseurl and is dropped.

	Returns a list of href strings in document order.
	"""
	global subscribedsites, subscribedfeeds, baseurl
	soup = BeautifulSoup(html)
	links = []
	# href=True limits the match to anchors that actually carry an href;
	# named anchors such as <a name="top"> would otherwise raise KeyError.
	for a in soup.findAll('a', href=True):
		href = a['href']
		netloc = urlparse(href).netloc
		if netloc in baseurl:
			continue
		if any(netloc in s for s in subscribedsites):
			continue
		if any(netloc in s for s in links):
			continue
		links.append(href)
	return links
	
# getfeed(html) - looks for feed URLs in the html you pass in
#   uses beautifulsoup to extract links
#   same basic logic as extract links to make sure you only get feeds you don't have
def getfeed(html):
	"""Return the new feeds advertised by <link rel="alternate"> tags.

	html -- markup of a page, parsed with BeautifulSoup.

	A link is kept only when its netloc is not a substring of any entry in
	subscribedfeeds and not a substring of the href of a feed already
	collected from this page.

	Returns a list of dicts with the keys 'href' and 'title'.  A link
	without an href is ignored; a link without a title gets its href as
	title, so a missing attribute no longer raises KeyError.

	NOTE: rel="alternate" is the only filter, the link's type attribute is
	not inspected.  A relative href has an empty netloc, which is a
	substring of every string, so it is treated as already subscribed
	whenever subscribedfeeds is non-empty.
	"""
	global subscribedsites, subscribedfeeds, baseurl
	soup = BeautifulSoup(html)
	feed = []
	for l in soup.findAll('link'):
		if l.get('rel') != 'alternate':
			continue
		href = l.get('href')
		if not href:
			continue
		netloc = urlparse(href).netloc
		if any(netloc in s for s in subscribedfeeds):
			continue
		if any(netloc in f['href'] for f in feed):
			continue
		feed.append({'href': href, 'title': l.get('title', href)})
	return feed
	
# main - unimaginative?  yes, but it works
#   1) creates an OPML stub
#   2) grabs the seed page and parses it for new links
#   3) goes out and gets feeds (if they exist)
#   4) adds feeds to the stub opml
#   5) writes the opml file out for import elsewhere
def main():
	"""Crawl the seed page and write newly discovered feeds to newfeeds.opml.

	Steps:
	  1) load the existing subscriptions,
	  2) download baseurl and collect the links to unknown sites,
	  3) download every such link and look for feeds it advertises,
	  4) add one <outline> per feed to an OPML document,
	  5) write the document to 'newfeeds.opml' in the current directory.

	A link that cannot be downloaded or parsed is reported on stdout and
	skipped; the run continues with the next link.
	"""
	global subscribedsites, subscribedfeeds, baseurl
	# Local import: codecs is only needed for the output file below.
	import codecs

	loadsubscriptions()
	links = extractlinks(gethtml(baseurl))

	xml = minidom.Document()
	opml = xml.createElement('opml')
	# OPML requires a version attribute on the root element.
	opml.setAttribute('version', '2.0')
	opml.appendChild(xml.createElement('head'))
	body = xml.createElement('body')

	print('%d links' % len(links))
	for index, l in enumerate(links):
		# 'replace' keeps the progress output from raising
		# UnicodeEncodeError on characters outside latin-1.
		shown = l.encode('latin-1', 'replace')
		print('processing link %d - %s' % (index + 1, shown))
		try:
			feeds = getfeed(gethtml(l))
		except Exception:
			# Best effort: one bad site must not stop the crawl.  Catching
			# Exception (not a bare except) lets Ctrl-C abort the run.
			print('Could not get %s' % shown)
			continue
		for f in feeds:
			outline = xml.createElement('outline')
			# text and type are required on subscription-list outlines;
			# title, htmlUrl and xmlUrl are written as before.
			outline.setAttribute('text', f['title'])
			outline.setAttribute('type', 'rss')
			outline.setAttribute('title', f['title'])
			outline.setAttribute('htmlUrl', l)
			outline.setAttribute('xmlUrl', f['href'])
			body.appendChild(outline)

	opml.appendChild(body)
	xml.appendChild(opml)
	# writexml's encoding argument only sets the XML declaration; the
	# stream itself must encode, otherwise non-ASCII titles raise
	# UnicodeEncodeError on write.
	fp = codecs.open("newfeeds.opml", "w", "utf-8")
	try:
		# writexml(self, writer, indent='', addindent='', newl='', encoding=None)
		xml.writexml(fp, "    ", "", "\n", "UTF-8")
	finally:
		fp.close()
	
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
	main()

