#!/usr/bin/env python
|---|
| 2 | |
|---|
| 3 | """ |
|---|
| 4 | Module to fetch the 1736 Canting Dictionary from: |
|---|
| 5 | http://www.fromoldbooks.org/NathanBailey-CantingDictionary/transcription.html |
|---|
| 6 | """ |
|---|
| 7 | |
|---|
try:  # Python 2 module names
    import HTMLParser
    import urllib2
    import urllib
    from urlparse import urljoin
except ImportError:  # Python 3: the same APIs under their new names
    import html.parser as HTMLParser
    import urllib
    import urllib.request as urllib2
    from urllib.parse import urljoin
|---|
| 11 | |
|---|
class LetterPageParser(HTMLParser.HTMLParser, object):
    """Collect the targets of links found inside table cells of a letter page.

    Every ``<a href=...>`` start tag seen while a ``<td>`` element is open is
    resolved against the base URL given to the constructor and recorded.
    Links outside table cells are ignored.

    ``object`` is listed as an extra base class so that ``super()`` also
    works on Python 2, where ``HTMLParser.HTMLParser`` is an old-style class.
    """

    def __init__(self, sBaseUrl):
        """Create a parser that resolves relative links against sBaseUrl."""
        self._aURLs = []
        self._bInTd = False
        self._sBaseUrl = sBaseUrl
        super(LetterPageParser, self).__init__()

    def get_urls(self):
        """Return a copy of the list of absolute URLs collected so far."""
        return list(self._aURLs)

    def reset(self):
        """Reset the parser and discard any URLs collected so far."""
        super(LetterPageParser, self).reset()
        self._aURLs = []
        self._bInTd = False

    def handle_starttag(self, sTag, aAttr):
        """Track entry into a table cell and record links found inside one."""
        # HTMLParser passes tag names already converted to lower case.
        if not self._bInTd:
            if sTag == "td":
                self._bInTd = True
        elif sTag == "a":
            # Anchors without a link target (e.g. <a name="...">) or with an
            # empty href are skipped: indexing the missing key would raise
            # KeyError, and an empty href would resolve to the base page.
            sHref = dict(aAttr).get("href")
            if sHref:
                self._aURLs.append(urljoin(self._sBaseUrl, sHref))

    def handle_endtag(self, sTag):
        """Track leaving a table cell."""
        if sTag == "td":
            self._bInTd = False

    # Text and character/entity references are irrelevant when collecting
    # links, so these callbacks deliberately do nothing.
    def handle_data(self, sData): pass
    def handle_charref(self, sName): pass
    def handle_entityref(self, sName): pass
|---|
| 48 | |
|---|
| 49 | |
|---|
class WordPageParser(HTMLParser.HTMLParser, object):
    """Extract a headword and its definition from a word page.

    Text is buffered per ``<p>`` element.  The text inside a
    ``<span class="headword">`` becomes the word, and the whole text of the
    paragraph containing that span (headword included) becomes the
    definition once the paragraph is closed.  If several paragraphs contain
    a headword span, the values from the last one are kept.
    """

    def __init__(self):
        """Create a parser with no word or definition recorded."""
        self._sWord = None
        self._sDef = None
        self._sData = ""
        self._bWordP = False
        self._iSpanDepth = 0
        self._iWordStart = 0
        super(WordPageParser, self).__init__()

    def get_word(self):
        """Return a (word, definition) tuple.

        Either item is None when the corresponding text has not been seen.
        """
        return self._sWord, self._sDef

    def reset(self):
        """Reset the parser and forget everything parsed so far."""
        self._sWord = None
        self._sDef = None
        self._sData = ""
        # Also clear the per-paragraph state so a reused parser does not
        # treat the next document as being inside a headword paragraph.
        self._bWordP = False
        self._iSpanDepth = 0
        self._iWordStart = 0
        super(WordPageParser, self).reset()

    def handle_starttag(self, sTag, aAttr):
        """Start a fresh text buffer at <p>; note where a headword begins."""
        # HTMLParser passes tag names already converted to lower case.
        if sTag == "p":
            self._sData = ""
            self._bWordP = False
            self._iSpanDepth = 0
        elif sTag == "span":
            if self._iSpanDepth:
                # A span nested inside the headword span: count it so its
                # closing tag is not mistaken for the end of the headword.
                self._iSpanDepth += 1
            elif dict(aAttr).get("class") == "headword":
                self._bWordP = True
                self._iSpanDepth = 1
                # Remember where the headword text starts in the buffer so
                # text preceding the span does not end up in the word.
                self._iWordStart = len(self._sData)

    def handle_endtag(self, sTag):
        """Record the word when its span closes and the definition at </p>."""
        if sTag == "span":
            # Only the tag closing the headword span sets the word; other
            # spans in the paragraph must not overwrite it.
            if self._iSpanDepth:
                self._iSpanDepth -= 1
                if not self._iSpanDepth:
                    self._sWord = self._sData[self._iWordStart:]
        elif sTag == "p" and self._bWordP:
            self._sDef = self._sData
            # The headword paragraph is finished; a stray </p> later on must
            # not replace the definition.
            self._bWordP = False

    def handle_data(self, sData):
        """Append text to the buffer of the current paragraph."""
        self._sData += sData

    # Deliberate no-ops: references delivered through these two callbacks
    # are not added to the buffered text.
    def handle_charref(self, sName): pass
    def handle_entityref(self, sName): pass
|---|
| 90 | |
|---|
| 91 | |
|---|
def _fetch_page(sUrl):
    """Return the body of sUrl in a form HTMLParser.feed() accepts.

    The connection is always closed, even when reading fails.
    """
    f = urllib2.urlopen(sUrl)
    try:
        sPage = f.read()
        if not isinstance(sPage, str):
            # Python 3: read() returns bytes but feed() only accepts text.
            # On Python 2 read() already returns str and is left untouched.
            sCharset = f.headers.get_content_charset() or "utf-8"
            sPage = sPage.decode(sCharset, "replace")
    finally:
        f.close()
    return sPage


def fetch_word(sUrl):
    """Fetch the word from sUrl, returning the word and definition.

    The result is the (word, definition) tuple reported by WordPageParser
    after it has been fed the page.
    """
    oP = WordPageParser()
    oP.feed(_fetch_page(sUrl))
    return oP.get_word()


def fetch_letter(sUrl):
    """Fetch all the words from the letter page sUrl.

    This is a generator: the letter page is downloaded when iteration
    starts, then each URL collected by LetterPageParser is fetched in turn
    and its fetch_word() result is yielded.
    """
    oP = LetterPageParser(sUrl)
    oP.feed(_fetch_page(sUrl))

    for sWordUrl in oP.get_urls():
        yield fetch_word(sWordUrl)
|---|
| 113 | |
|---|
def fetch_all(sUrl):
    """Fetch all the words using sUrl as the base url.

    This is a generator yielding whatever fetch_letter() yields for each
    letter section in turn.  sUrl may be given with or without a trailing
    slash.
    """
    # Drop trailing slashes so the section URL built below never contains
    # "//" between the base and the section name.
    sBase = sUrl.rstrip("/")

    # "IJ" and "UV" are each requested as a single section.
    # NOTE(review): "W" is absent from this list -- confirm whether the site
    # really has no W section or whether it was left out by mistake.
    aSections = (list("ABCDEFGH") + ["IJ"] +
                 list("KLMNOPQRST") + ["UV"] +
                 list("XYZ"))

    for sL in aSections:
        for tWordDef in fetch_letter("%s/%s/" % (sBase, sL)):
            yield tWordDef
|---|
| 121 | |
|---|
def main():
    """Write every dictionary entry to standard output.

    Each entry is written as the word, a newline followed by one space, the
    definition, and then a blank line.  Output is flushed after every entry
    so results appear while later pages are still being fetched.
    """
    import sys

    sBaseUrl = "http://www.fromoldbooks.org/NathanBailey-CantingDictionary/"

    for sWord, sDef in fetch_all(sBaseUrl):
        if sWord is None or sDef is None:
            # A page without a headword paragraph yields None values, and
            # write(None) would raise TypeError and abort the whole run.
            continue
        sys.stdout.write(sWord)
        sys.stdout.write("\n ")
        sys.stdout.write(sDef)
        sys.stdout.write("\n\n")
        sys.stdout.flush()


if __name__ == "__main__":
    main()
|---|