root/hodgestar/PythonCode/SmallScripts/fetch-canting.py

Revision 409, 3.5 kB (checked in by simon, 4 years ago)

Python script for downloading the thieves' canting dictionary and printing the result.

  • Property svn:mime-type set to text/python-source
  • Property svn:eol-style set to native
Line 
1#!env python
2
3"""
4Module to fetch the 1736 Canting Dictionary from:
5http://www.fromoldbooks.org/NathanBailey-CantingDictionary/transcription.html
6"""
7
8import HTMLParser
9import urllib2
10import urllib
11
class LetterPageParser(HTMLParser.HTMLParser, object):
    """Collect the absolute URLs of links that appear inside table cells.

    Every ``<a>`` start tag seen while a ``<td>`` is open contributes its
    ``href``, resolved against the base URL passed to the constructor.
    ``object`` is listed as a base so that ``super()`` can be used.
    """

    def __init__(self, sBaseUrl):
        """Create a parser that resolves relative hrefs against sBaseUrl."""
        self._aURLs = []
        self._bInTd = False
        self._sBaseUrl = sBaseUrl
        super(LetterPageParser, self).__init__()

    def get_urls(self):
        """Return a copy of the list of URLs collected so far."""
        return list(self._aURLs)

    def reset(self):
        """Reset the parser and forget all collected URLs."""
        super(LetterPageParser, self).reset()
        self._aURLs = []
        self._bInTd = False

    def handle_starttag(self, sTag, aAttr):
        """Track entry into a table cell and record links found inside one."""
        sTag = sTag.lower()
        if not self._bInTd:
            if sTag == "td":
                self._bInTd = True
            return
        if sTag != "a":
            return
        sHref = dict(aAttr).get("href")
        if sHref is None:
            # Anchors without a target (e.g. <a name="...">) are not links;
            # skip them instead of failing with a KeyError.
            return
        self._aURLs.append(urllib.basejoin(self._sBaseUrl, sHref))

    def handle_endtag(self, sTag):
        """Leave table-cell mode when the cell is closed."""
        if self._bInTd and sTag.lower() == "td":
            self._bInTd = False

    # Text and character/entity references are irrelevant for link
    # collection, so they are deliberately ignored.
    def handle_data(self, sData): pass
    def handle_charref(self, sName): pass
    def handle_entityref(self, sName): pass
48
49
class WordPageParser(HTMLParser.HTMLParser, object):
    """Extract a headword and its paragraph text from a word page.

    The headword is the text accumulated in the current paragraph up to
    the end of a ``<span class="headword">``; the definition is the whole
    text of the paragraph that contains that span (headword included).
    ``object`` is listed as a base so that ``super()`` can be used.
    """

    def __init__(self):
        """Create a parser with no word or definition captured yet."""
        self._sWord = None
        self._sDef = None
        self._sData = ""
        # True from the headword span until its paragraph is closed.
        self._bWordP = False
        # True only while the headword span itself is still open.
        self._bInHeadword = False
        super(WordPageParser, self).__init__()

    def get_word(self):
        """Return the (word, definition) tuple; an item is None if not found."""
        return self._sWord, self._sDef

    def reset(self):
        """Reset the parser, discarding any captured word and state."""
        self._sWord = None
        self._sDef = None
        self._sData = ""
        # Clear the tracking flags too, so a reused parser does not carry
        # headword state over from the previous document.
        self._bWordP = False
        self._bInHeadword = False
        super(WordPageParser, self).reset()

    def handle_starttag(self, sTag, aAttr):
        """Start a fresh text buffer on <p>; note when a headword span opens."""
        sTag = sTag.lower()
        if sTag == "p":
            self._sData = ""
            self._bWordP = False
            self._bInHeadword = False
        elif sTag == "span":
            if dict(aAttr).get("class") == "headword":
                self._bWordP = True
                self._bInHeadword = True

    def handle_endtag(self, sTag):
        """Capture the headword at its </span> and the definition at </p>."""
        if not self._bWordP:
            return
        sTag = sTag.lower()
        if sTag == "p":
            self._sDef = self._sData
            # The headword paragraph is finished; later stray end tags
            # must not overwrite what was captured.
            self._bWordP = False
            self._bInHeadword = False
        elif sTag == "span" and self._bInHeadword:
            # Only the span that opened the headword supplies the word;
            # other spans later in the paragraph are ignored.
            self._sWord = self._sData
            self._bInHeadword = False

    def handle_data(self, sData):
        """Append text to the buffer for the current paragraph."""
        self._sData += sData

    # Character and entity references are deliberately ignored.
    def handle_charref(self, sName): pass
    def handle_entityref(self, sName): pass
90
91
def fetch_word(sUrl):
    """Download the page at sUrl and return its (word, definition) tuple."""
    oParser = WordPageParser()
    oResponse = urllib2.urlopen(sUrl)
    try:
        sHtml = oResponse.read()
        oParser.feed(sHtml)
    finally:
        # Always release the connection, even if reading or parsing fails.
        oResponse.close()
    return oParser.get_word()
101
def fetch_letter(sUrl):
    """Yield the (word, definition) tuple for every link on letter page sUrl."""
    oParser = LetterPageParser(sUrl)
    oResponse = urllib2.urlopen(sUrl)
    try:
        sHtml = oResponse.read()
        oParser.feed(sHtml)
    finally:
        # Always release the connection, even if reading or parsing fails.
        oResponse.close()

    aWordUrls = oParser.get_urls()
    for sLink in aWordUrls:
        yield fetch_word(sLink)
113
def fetch_all(sUrl):
    """Yield (word, definition) tuples for every section under base URL sUrl.

    sUrl may be given with or without a trailing slash.
    """
    # Sections "IJ" and "UV" are each requested as a single combined page.
    # NOTE(review): "W" is absent from this sequence - confirm whether that
    # is intentional or whether the "W" section is being skipped by mistake.
    aSections = (list("ABCDEFGH") + ["IJ"] +
                 list("KLMNOPQRST") + ["UV"] +
                 list("XYZ"))
    # Strip trailing slashes so the section URL never contains "//".
    sRoot = sUrl.rstrip("/")
    for sSection in aSections:
        for tWordDef in fetch_letter("%s/%s/" % (sRoot, sSection)):
            yield tWordDef
121
# Script entry point: print every word followed by its indented definition.
if __name__ == "__main__":
    import sys

    sBaseUrl = "http://www.fromoldbooks.org/NathanBailey-CantingDictionary/"

    for sWord, sDef in fetch_all(sBaseUrl):
        if sWord is None or sDef is None:
            # A linked page without a headword paragraph yields None values;
            # report and skip it rather than crash in sys.stdout.write().
            sys.stderr.write("skipping page without a complete entry\n")
            continue
        sys.stdout.write(sWord)
        sys.stdout.write("\n    ")
        sys.stdout.write(sDef)
        sys.stdout.write("\n\n")
        # Flush per entry so output appears while the download continues.
        sys.stdout.flush()
Note: See TracBrowser for help on using the browser.