root/hodgestar/PythonCode/Duophage/duophage.py

Revision 366, 7.1 kB (checked in by simon, 4 years ago)

Implement (somewhat fragile) libxml2 version of the HTML parser.

  • Property svn:mime-type set to text/python-source
  • Property svn:eol-style set to native
  • Property svn:executable set to *
Line 
1#!/usr/bin/python
2
# Optional parser back-ends. Each probe records whether the import worked in
# a bHave* flag and installs a harmless placeholder otherwise, so the class
# statements further down can still be evaluated.
try:
    from HTMLParser import HTMLParser
    bHaveHTMLParser = True
except ImportError:
    HTMLParser = object
    bHaveHTMLParser = False

try:
    from libxml2 import SAXCallback, createPushParser
    bHaveLibxml2 = True
except ImportError:
    # Only a failed import means "libxml2 unavailable"; a bare except here
    # would also hide KeyboardInterrupt and SystemExit.
    SAXCallback = object
    createPushParser = None
    bHaveLibxml2 = False

import urllib, sys, base64
import logging
19
class DuoParserBase(object):
    """Base class with helper methods for Duophage parsers.

       Sub-classes must provide:
         self._sBaseUrl : base URL for the document.
         self._fOut : output file.
       """

    def _urlToAbsUrl(self, sUrl):
        """
        Convert a relative or absolute URL to an absolute URL.

        The URL is resolved against the current self._sBaseUrl.
        """
        return urllib.basejoin(self._sBaseUrl, sUrl)

    def _urlToData(self, sUrl):
        """
        Convert any URL to an equivalent data: URL.

        URLs that already use the data: scheme are returned unchanged.
        Anything else is resolved against the base URL, downloaded and
        returned as 'data:<content-type>;base64,<payload>'.
        """
        if sUrl.startswith('data:'):
            return sUrl

        fSrc = urllib.urlopen(self._urlToAbsUrl(sUrl))
        try:
            sData = base64.standard_b64encode(fSrc.read())
            # A response without a Content-Type header must not abort the
            # whole conversion; fall back to the generic binary type.
            sMime = fSrc.info().get('Content-Type') or 'application/octet-stream'
        finally:
            # Always release the connection, even if the read failed.
            fSrc.close()

        return 'data:' + sMime + ';base64,' + sData

    def _handleStartTag(self, sTag, aAttrs, bEnd=False):
        """
        Handle a start tag.

        sTag is the tag name and aAttrs the attributes, as anything dict()
        accepts (a sequence of (name, value) pairs or a mapping) or None.
        A value of None marks an attribute that was given without a value.

        href attributes are made absolute, src attributes and stylesheet
        links are replaced by data: URLs, and the resulting tag is written
        to self._fOut.

        If bEnd is True, this is also an end tag.
        """
        # Local import: the module's import block does not provide it.
        from xml.sax.saxutils import escape

        if aAttrs:
            dAttrs = dict(aAttrs)
        else:
            dAttrs = {}

        sHref = dAttrs.get('href')
        if sHref is not None:
            if sTag == 'base':
                # Set a new base url if we encounter a base tag. It is
                # resolved against the old base so that a relative
                # <base href> still yields an absolute base URL.
                sHref = self._sBaseUrl = self._urlToAbsUrl(sHref)
            elif not sHref.startswith('data:'):
                # Canonicalise non-data URLs
                sHref = self._urlToAbsUrl(sHref)

            # Convert links to stylesheets to data. 'rel' may be absent or
            # valueless (None), hence the "or ''".
            if sTag == 'link' and (dAttrs.get('rel') or '').lower() == 'stylesheet':
                sHref = self._urlToData(sHref)

            dAttrs['href'] = sHref

        # Convert src URLs to data
        if dAttrs.get('src') is not None:
            dAttrs['src'] = self._urlToData(dAttrs['src'])

        # write out new tag
        self._fOut.write("<" + sTag)
        for k, v in dAttrs.items():
            if v is None:
                # Valueless attribute: use the attribute name as its value.
                v = k
            # The values are written inside single quotes, so markup
            # characters and single quotes have to be escaped.
            self._fOut.write(" " + k + "='" + escape(v, {"'": "&#39;"}) + "'")
        if bEnd:
            self._fOut.write("/>")
        else:
            self._fOut.write(">")
86
87
class DuoParser(DuoParserBase, HTMLParser):
    """
    Duophage parser based on standard Python HTMLParser.

    Every parser event is written back out to self._fOut; start tags go
    through _handleStartTag so their URLs can be rewritten.

    DuoParserBase is listed first among the bases on purpose: when the
    HTMLParser import fails the name HTMLParser is bound to `object`, and
    a class with bases (object, DuoParserBase) has no consistent method
    resolution order, so the class statement itself would raise TypeError.
    """

    def __init__(self, fOut, sBaseUrl):
        """
        fOut is the file-like object the document is written to,
        sBaseUrl the URL that relative URLs are resolved against.
        """
        # Called explicitly rather than through super(): with DuoParserBase
        # first in the bases, super() could resolve to object.__init__ and
        # leave the HTMLParser state uninitialised.
        HTMLParser.__init__(self)
        self._fOut = fOut
        self._sBaseUrl = sBaseUrl

    def handle_starttag(self, sTag, aAttrs):
        self._handleStartTag(sTag, aAttrs, False)

    def handle_startendtag(self, sTag, aAttrs):
        self._handleStartTag(sTag, aAttrs, True)

    def handle_endtag(self, sTag):
        self._fOut.write("</" + sTag + ">")

    def handle_data(self, sData):
        self._fOut.write(sData)

    def handle_charref(self, sName):
        # sName is only the numeric part ('62' or 'x3E'); the '#' has to be
        # put back or the output would be the bogus entity '&62;'.
        self._fOut.write("&#" + sName + ";")

    def handle_entityref(self, sName):
        self._fOut.write("&" + sName + ";")

    def handle_comment(self, sData):
        self._fOut.write("<!--" + sData + "-->")

    def handle_decl(self, sDecl):
        self._fOut.write("<!" + sDecl + ">")

    def handle_pi(self, sData):
        self._fOut.write("<?" + sData + ">")
106
107
class DuoLxmlParser(DuoParserBase, SAXCallback):
    """
    Duophage parser for libxml2.

    SAX callback handler that writes the events it receives back out to
    self._fOut; start tags go through _handleStartTag so their URLs can be
    rewritten.

    DuoParserBase is listed first among the bases on purpose: when libxml2
    cannot be imported the name SAXCallback is bound to `object`, and a
    class with bases (object, DuoParserBase) has no consistent method
    resolution order, so the class statement itself would raise TypeError
    and the module could not be loaded at all.
    """

    def __init__(self, fOut, sBaseUrl):
        """
        fOut is the file-like object the document is written to,
        sBaseUrl the URL that relative URLs are resolved against.
        """
        super(DuoLxmlParser, self).__init__()
        self._fOut = fOut
        self._sBaseUrl = sBaseUrl

    def startElement(self, sTag, aAttrs):
        self._handleStartTag(sTag, aAttrs, False)

    def endElement(self, sTag):
        self._fOut.write("</" + sTag + ">")

    def characters(self, sData):
        self._fOut.write(sData)

    # def handle_charref(self, sName): self._fOut.write("&" + sName + ";")

    def reference(self, sName):
        self._fOut.write("&" + sName + ";")

    def comment(self, sData):
        self._fOut.write("<!--" + sData + "-->")

    # def handle_decl(self, sDecl): self._fOut.write("<!" + sDecl + ">")
    # def handle_pi(self, sData): self._fOut.write("<?" + sData + ">")

    def error(self, msg):
        # Parser errors are logged as warnings instead of aborting.
        logging.warning(msg)

    # Other methods which might be worth implementing:
    #
    #  attributeDecl(self, elem, name, type, defi, defaultValue, nameList)
    #      called when an ATTRIBUTE definition has been found
    #
    #  cdataBlock(self, data)
    #      called when a CDATA section has been read, data is the string
    #      containing the data, multiple consecutive cdataBlock() callbacks
    #      are possible.
    #
    #  elementDecl(self, name, type, content)
    #      called when an ELEMENT definition has been found
    #
    #  entityDecl(self, name, publicId, systemID, notationName)
    #      called when an unparsed ENTITY declaration has been found,
    #      name is the entity name and publicId, systemID are the entity
    #      public and system identifier for that entity if available,
    #      and notationName indicates the associated NOTATION
    #
    #  externalSubset(self, name, externalID, systemID)
    #      called when a DOCTYPE declaration has been found, name is the
    #      DTD name and externalID, systemID are the DTD public and system
    #      identifier for that DTD if available
    #
    #  ignorableWhitespace(self, data)
    #      called when potentially ignorable white spaces have been found
    #
    #  internalSubset(self, name, externalID, systemID)
    #      called when a DOCTYPE declaration has been found, name is the
    #      DTD name and externalID, systemID are the DTD public and system
    #      identifier for that DTD if available
    #
    #  notationDecl(self, name, externalID, systemID)
    #      called when a NOTATION declaration has been found, name is the
    #      notation name and externalID, systemID are the notation public and
    #      system identifier for that notation if available
    #
    #  processingInstruction(self, target, data)
    #      called when a PI has been found, target contains the PI name and
    #      data is the associated data in the PI
173
def main(aArgs):
    """
    Command-line entry point: make one HTML page self-contained.

    aArgs is the argument vector: the program name followed by the URL of
    the page to convert. The converted document is written to sys.stdout.
    Usage and error messages go to sys.stderr, so they cannot end up
    inside a redirected output file.

    Returns the exit status: 0 on success, 1 for wrong usage, 2 when no
    HTML parser is available.
    """
    if len(aArgs) != 2:
        # Tolerate an empty argument vector when printing the usage line.
        if aArgs:
            sProg = aArgs[0]
        else:
            sProg = "duophage.py"
        sys.stderr.write(
            sProg + " 'http://example/foo' > output.html\n"
            "  ---\n"
            "  Duophage makes HTML pages self-contained by looking for\n"
            "  src='...' attributes, downloading the data and encoding\n"
            "  it into a 'data:...' URI.\n")
        return 1

    sUrl = aArgs[1]

    # Currently the libxml2 SAX parser is looking even less robust
    # against bad HTML than the stock HTMLParser :/
    #   if bHaveLibxml2:
    #        oH = DuoLxmlParser(sys.stdout, sUrl)
    #        oP = createPushParser(oH, "", 0, sUrl)
    #        for s in urllib.urlopen(sUrl):
    #            oP.parseChunk(s, len(s), 0)
    #        oP.parseChunk("", 0, 1)

    if not bHaveHTMLParser:
        sys.stderr.write("No parsers found (tried HTMLParser and libxml2).\n")
        return 2

    oP = DuoParser(sys.stdout, sUrl)
    fSrc = urllib.urlopen(sUrl)
    try:
        for s in fSrc:
            oP.feed(s)
        # feed() may hold back an incomplete trailing construct; close()
        # forces the parser to process whatever is still buffered.
        oP.close()
    finally:
        fSrc.close()

    return 0
204
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
Note: See TracBrowser for help on using the browser.