# bulkdownload.py
#
# Downloads all Dutch etexts from Project Gutenberg's website.
# Todo: Implement other languages as well.
#
# Software by Michiel Overtoom, motoom@xs4all.nl, july 2009.

import contextlib
import os
import re

try:
    # Python 2
    from urllib2 import HTTPError, urlopen
except ImportError:
    # Python 3: the same names live in urllib.request / urllib.error.
    from urllib.error import HTTPError
    from urllib.request import urlopen

# Index page listing all Dutch etexts (approx. 400).
INDEX_URL = "http://www.gutenberg.org/browse/languages/nl"

# Etexts live at locations like
# http://www.gutenberg.org/files/25257/25257-8.txt
ETEXT_URL = "http://www.gutenberg.org/files/%s/%s-8.txt"

# To isolate etext book numbers from the index.
# Index source contains entries roughly like:
#   <li><a href="/etext/17077">Over literatuur</a></li>
# This is a bytes pattern because the HTTP response yields undecoded lines.
hrefpat = re.compile(br'href="/etext/([0-9]{5})"')


def extract_ids(lines):
    """Return the set of etext ids referenced in *lines*.

    No need to parse the HTML source, since we only need to grab numbers.

    lines -- iterable of byte strings (e.g. an open HTTP response).

    Every match on a line is collected, not just the first one, so several
    links sharing one line are all picked up.  Ids are returned as native
    strings, e.g. "17077".
    """
    ids = set()
    for line in lines:
        for raw in hrefpat.findall(line):
            # str() keeps the id a native string on both Python 2 and 3.
            ids.add(str(raw.decode("ascii")))
    return ids


def fetch_ids(index_url=INDEX_URL):
    """Download the index page at *index_url* and return its etext ids.

    Errors raised by urlopen are not caught here; they propagate to the
    caller.  The response is closed even when reading it fails.
    """
    with contextlib.closing(urlopen(index_url)) as response:
        return extract_ids(response)


def download_etext(etext_id):
    """Download one etext into '<etext_id>-8.txt' in the current directory.

    Returns True when the file was written, False when it already existed
    or the server answered with an HTTP error.  Other errors (for example
    an unreachable host) propagate, as they did before.
    """
    ofn = "%s-8.txt" % etext_id
    if os.path.isfile(ofn):
        print("Already exists: %s" % ofn)
        return False

    url = ETEXT_URL % (etext_id, etext_id)
    print(url)
    try:
        response = urlopen(url)
    except HTTPError:
        print("Warning: Can't fetch: %s" % url)
        return False

    # Read the whole body before creating the output file, so that a failed
    # or interrupted transfer does not leave an empty file behind that the
    # next run would skip as "Already exists".
    with contextlib.closing(response):
        data = response.read()
    with open(ofn, "wb") as out:
        out.write(data)
    return True


def main():
    """Fetch the id list and download every etext that is not yet present."""
    # Sorted only to make the progress output deterministic.
    for etext_id in sorted(fetch_ids()):
        download_etext(etext_id)


if __name__ == "__main__":
    main()