CampusROAR: A Simple Python Crawler

Over the last week I’ve been building a simple Python web crawler for grabbing RSS feeds from a domain. The one I had built in Bash using command-line tools (namely wget) had several downsides that needed fixing, so I started work on a customized Python one. Wget did not allow filtering of pages by HTTP header/MIME type, and while it did allow filtering by file extension, many of the files on the domain I was scraping had no extension and so couldn’t be filtered. There was also no way to filter by file size, so I couldn’t fall back on restricting the crawl to smaller files, which would have been an OK solution. Instead, the crawler occasionally had to download huge files that either had no file extension or had one I had not thought to blacklist.

The Python crawler I created therefore has several advantages. It first checks the HTTP headers, and only requests the full page if it’s HTML or XML (HTML to collect all the links from the page, XML to check whether it is an RSS feed). It then parses the full page and either extracts the links from it or, if it’s an RSS or Atom feed, adds it to a list of feeds. You have to give it a starting page and a domain restriction – it starts crawling from the starting page and cannot leave the given domain. If you did not give it a domain restriction it would probably continue forever (or at least until it had spidered the whole internet).

crawl.py

import config
import urllib2
import feedparser
import lxml.html
import urlparse
import datetime

# Crawl state shared by the functions in this module.
ignore = list()        # every URL that has been handed to process_url
feeds = list()         # URLs that were recognised as feeds
urllist = set()        # URLs queued for processing
output = "feeds.txt"   # file the crawl markers and feed URLs are appended to

def main():

 url = config.START_URL

 FILE = open(output,"a")
 FILE.write("CRAWL STARTED ON: "+ str(datetime.datetime.today()) +"\n")
 FILE.close()

 process_url(url)

 while len(urllist) > 0:
 process_url(url)
 url = urllist.pop()

 FILE = open(output,"a")
 FILE.write("END OF CRAWL.\n")
 FILE.close()

 print "URLs: ", urllist
 print "Feeds: ", feeds
 print "Ignore: ", ignore

class HeadRequest(urllib2.Request):
 ## Custom Headrequest Request class for urllib2 to get page headers.
 def get_method(self):
 return "HEAD"

def getPageMime(url):
    """Return the MIME type reported for *url*, or None if it is unavailable.

    Issues a HEAD request (20 second timeout) and reads the Content-Type
    header. Only the part before the first ";" is returned, with
    surrounding whitespace removed and lower-cased, so a header such as
    "Text/HTML ; charset=utf-8" yields "text/html".

    Any failure (network error, bad URL, missing header) yields None so
    that one bad URL does not stop the crawl.
    """
    response = None
    try:
        response = urllib2.urlopen(HeadRequest(url), timeout=20)
        header = response.info()["content-type"]
        return header.split(";")[0].strip().lower()
    except Exception:
        # Deliberately broad: this is best-effort. Unlike a bare "except:",
        # it still lets KeyboardInterrupt and SystemExit through, so the
        # crawl can be interrupted during a request.
        return None
    finally:
        if response is not None:
            response.close()

def getPageAndParse(url, contenttype):

 #########################################
 #Takes a url and mimetype and parses, returning a list of valid urls in the page,
 #and a flag which is True if the page is an RSS or Atom feed and false otherwise.
 #########################################

 response = urllib2.urlopen(url)
 page = response.read()
 address = response.url

 feed = False
 if (contenttype == "application/rss+xml" or contenttype =="application/xhtml+xml"):
 feed = parseforRSS(page, address)

 urls = []    
 if (contenttype == "text/html"):
 urls = parseforURLs(page, address)

 return (feed, urls)

def parseforRSS(page, address):
    """Return True if feedparser reports a feed version for *page*.

    page    -- the downloaded document content.
    address -- the page's url; accepted for symmetry with parseforURLs
               but not used here.
    """
    return bool(feedparser.parse(page).version)

def parseforURLs(page, address):
    """Return the absolute urls of the links (<a href>) found in *page*.

    page    -- the downloaded HTML content.
    address -- the page's own url, used as the base for relative links.

    Only urls that start with "http" after resolution are returned.
    Returns an empty list when the page cannot be parsed.
    """
    try:
        webpage = lxml.html.fromstring(page)
    except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError):
        # ParserError is what lxml.html raises for an empty document.
        return []

    validurls = []
    for href in webpage.xpath('//a/@href'):
        try:
            absolute = urlparse.urljoin(address, href)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link.
            continue
        if absolute.startswith('http'):
            validurls.append(absolute)
    return validurls

def process_url(url):

 #########################################
 #Processes a url for extracting links or finding feeds.
 #########################################

 print "Processing url: ", url

 ignore.append(url)
 mime = getPageMime(url)
 if (mime == "application/rss+xml" or mime == "application/xhtml+xml" or mime == "text/html"):
 result = getPageAndParse(url,mime)
 if result[0] == True:
 if (config.DOMAIN in urlparse.urlparse(item).netloc):
 feeds.append(url)
 FILE = open(output,"a")
 FILE.write(url)
 FILE.close()
 else:
 for item in result[1]:
 if item not in ignore:
 if (config.DOMAIN in urlparse.urlparse(item).netloc):
 urllist.add(item)

# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
 main()

config.py

# Configuration values for the RSSCrawl web crawler.

# Name of the crawler.
CRAWLER_NAME = "RSSCrawl"

# First url the crawler processes.
START_URL = "http://www.ecs.soton.ac.uk"
# Text that must occur in a url's network location for the crawler to
# queue the url or record it as a feed.
DOMAIN = "www.ecs.soton.ac.uk"
Tagged with:

Leave a Reply

Your email address will not be published. Required fields are marked *

*