| 1 | #!/usr/bin/python |
|---|
| 2 | |
|---|
# Optional dependency: the standard HTMLParser module.  bHaveHTMLParser
# records whether the real parser class could be imported.
try:
    from HTMLParser import HTMLParser
    bHaveHTMLParser = True
except ImportError:
    # A stand-in base class keeps the DuoParser class statement valid when
    # the module is missing.  It has to be a distinct class rather than
    # `object` itself: listing `object` ahead of another new-style base
    # makes Python reject the class with "Cannot create a consistent
    # method resolution order".
    class HTMLParser(object):
        """Placeholder used when the HTMLParser module is unavailable."""
    bHaveHTMLParser = False
|---|
| 9 | |
|---|
# Optional dependency: the libxml2 Python bindings.  bHaveLibxml2 records
# whether they could be imported.
try:
    from libxml2 import SAXCallback, createPushParser
    bHaveLibxml2 = True
except ImportError:
    # A stand-in base class keeps the DuoLxmlParser class statement valid
    # when libxml2 is not installed.  It has to be a distinct class rather
    # than `object` itself: listing `object` ahead of another new-style
    # base makes Python reject the class with "Cannot create a consistent
    # method resolution order".
    class SAXCallback(object):
        """Placeholder used when the libxml2 bindings are unavailable."""
    bHaveLibxml2 = False
|---|
| 16 | |
|---|
| 17 | import urllib, sys, base64 |
|---|
| 18 | import logging |
|---|
| 19 | |
|---|
class DuoParserBase(object):
    """Base class with helper methods for Duophage parsers.

    Sub-classes must provide:
        self._sBaseUrl : base URL for the document.
        self._fOut     : output file.
    """

    def _urlToAbsUrl(self, sUrl):
        """
        Convert a relative or absolute URL to an absolute URL.

        The URL is resolved against self._sBaseUrl.
        """
        return urllib.basejoin(self._sBaseUrl, sUrl)

    def _urlToData(self, sUrl):
        """
        Convert any URL to an equivalent data: URL.

        A URL that already uses the data: scheme is returned unchanged.
        Anything else is resolved against the base URL, downloaded, and
        returned as 'data:<mime>;base64,<payload>'.
        """
        if sUrl.startswith('data:'):
            return sUrl

        fSrc = urllib.urlopen(self._urlToAbsUrl(sUrl))
        try:
            sData = base64.standard_b64encode(fSrc.read())
            # A response is not guaranteed to carry a Content-Type header;
            # fall back to a generic binary type instead of failing.
            sMime = fSrc.info().get('Content-Type') or 'application/octet-stream'
        finally:
            # Release the connection even when the download fails.
            fSrc.close()

        return 'data:' + sMime + ';base64,' + sData

    def _escapeAttr(self, sValue):
        """
        Escape an attribute value for output between single quotes.

        The parsers hand over attribute values with entities already
        decoded, so markup characters have to be re-encoded on output.
        """
        # '&' must be replaced first so the entities added by the later
        # replacements are not escaped a second time.
        sValue = sValue.replace('&', '&amp;')
        sValue = sValue.replace('<', '&lt;')
        sValue = sValue.replace('>', '&gt;')
        return sValue.replace("'", '&#39;')

    def _handleStartTag(self, sTag, aAttrs, bEnd=False):
        """
        Handle a start tag.

        aAttrs is either a sequence of (name, value) pairs or a mapping;
        a value of None marks an attribute given without a value.

        If bEnd is True, this is also an end tag.
        """
        dAttrs = dict(aAttrs) if aAttrs else {}

        # Valueless attributes have a value of None and carry no URL, so
        # .get() lets them fall through all URL handling below.
        sHref = dAttrs.get('href')
        sSrc = dAttrs.get('src')

        # Set a new base url if we encounter a base tag.  The new base is
        # itself resolved against the current one so that a relative
        # <base href> still produces absolute URLs afterwards.
        if sTag == 'base' and sHref is not None:
            self._sBaseUrl = self._urlToAbsUrl(sHref)

        # Canonicalise non-data URLs
        if sHref is not None and not sHref.startswith('data:'):
            sHref = self._urlToAbsUrl(sHref)
            dAttrs['href'] = sHref

        # Convert src URLs to data
        if sSrc is not None:
            dAttrs['src'] = self._urlToData(sSrc)

        # Convert links to stylesheets to data (only when there is a URL
        # to convert; a link without href is passed through untouched).
        sRel = dAttrs.get('rel') or ''
        if sTag == 'link' and sRel.lower() == 'stylesheet' and sHref is not None:
            dAttrs['href'] = self._urlToData(sHref)

        # write out new tag
        aOut = ["<" + sTag]
        for k, v in dAttrs.items():
            if v is None:
                # Attribute without a value: written as name='name'.
                v = k
            aOut.append(" " + k + "='" + self._escapeAttr(v) + "'")
        if bEnd:
            aOut.append("/>")
        else:
            aOut.append(">")
        self._fOut.write("".join(aOut))
|---|
| 86 | |
|---|
| 87 | |
|---|
class DuoParser(HTMLParser, DuoParserBase):
    """
    Duophage parser based on standard Python HTMLParser.

    Each parse event is written back to the output file.  Start tags go
    through _handleStartTag; every other event is re-serialised as it was
    parsed.
    """
    def __init__(self, fOut, sBaseUrl):
        """
        fOut is the file-like object receiving the output, sBaseUrl the
        URL of the document being parsed.
        """
        super(DuoParser, self).__init__()
        self._fOut = fOut
        self._sBaseUrl = sBaseUrl

    def handle_starttag(self, sTag, aAttrs):
        """Write an opening tag."""
        self._handleStartTag(sTag, aAttrs, False)

    def handle_startendtag(self, sTag, aAttrs):
        """Write a self-closing tag."""
        self._handleStartTag(sTag, aAttrs, True)

    def handle_endtag(self, sTag):
        """Write a closing tag."""
        self._fOut.write("</" + sTag + ">")

    def handle_data(self, sData):
        """Write text content unchanged."""
        self._fOut.write(sData)

    def handle_charref(self, sName):
        """Write a numeric character reference."""
        # HTMLParser passes the reference without its '#': '62' for &#62;
        # and 'x3E' for &#x3E;, so the '#' has to be put back here.
        self._fOut.write("&#" + sName + ";")

    def handle_entityref(self, sName):
        """Write a named entity reference."""
        self._fOut.write("&" + sName + ";")

    def handle_comment(self, sData):
        """Write a comment."""
        self._fOut.write("<!--" + sData + "-->")

    def handle_decl(self, sDecl):
        """Write a declaration such as a doctype."""
        self._fOut.write("<!" + sDecl + ">")

    def handle_pi(self, sData):
        """Write a processing instruction."""
        self._fOut.write("<?" + sData + ">")
|---|
| 106 | |
|---|
| 107 | |
|---|
class DuoLxmlParser(SAXCallback, DuoParserBase):
    """
    Duophage parser for libxml2.

    Implements libxml2 SAX callbacks and writes each event back to the
    output file.  Start tags go through _handleStartTag.
    """
    def __init__(self, fOut, sBaseUrl):
        """
        fOut is the file-like object receiving the output, sBaseUrl the
        URL of the document being parsed.
        """
        super(DuoLxmlParser, self).__init__()
        self._fOut = fOut
        self._sBaseUrl = sBaseUrl

    def startElement(self, sTag, aAttrs):
        """Write an opening tag."""
        self._handleStartTag(sTag, aAttrs, False)

    def endElement(self, sTag):
        """Write a closing tag."""
        self._fOut.write("</" + sTag + ">")

    def characters(self, sData):
        """Write text content unchanged."""
        self._fOut.write(sData)

    # NOTE(review): looks like a leftover from the HTMLParser-based parser;
    # nothing here handles numeric character references.
    # def handle_charref(self, sName): self._fOut.write("&" + sName + ";")

    def reference(self, sName):
        """Write an entity reference."""
        self._fOut.write("&" + sName + ";")

    def comment(self, sData):
        """Write a comment."""
        self._fOut.write("<!--" + sData + "-->")

    # NOTE(review): likewise apparently left over from the HTMLParser-based
    # parser; declarations and processing instructions are not written out.
    # def handle_decl(self, sDecl): self._fOut.write("<!" + sDecl + ">")
    # def handle_pi(self, sData): self._fOut.write("<?" + sData + ">")

    def error(self, msg):
        """Report a parser error message through the logging module."""
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(msg)

    # Other methods which might be worth implementing:
    #
    # attributeDecl(self, elem, name, type, defi, defaultValue, nameList)
    #     called when an ATTRIBUTE definition has been found
    #
    # cdataBlock(self, data)
    #     called when a CDATA section has been read, data is the string
    #     containing the data, multiple consecutive cdataBlock() callbacks
    #     are possible.
    #
    # elementDecl(self, name, type, content)
    #     called when an ELEMENT definition has been found
    #
    # entityDecl(self, name, publicId, systemID, notationName)
    #     called when an unparsed ENTITY declaration has been found,
    #     name is the entity name and publicId, systemID are the entity
    #     public and system identifiers for that entity if available,
    #     and notationName indicates the associated NOTATION
    #
    # externalSubset(self, name, externalID, systemID)
    #     called when a DOCTYPE declaration has been found, name is the
    #     DTD name and externalID, systemID are the DTD public and system
    #     identifiers for that DTD if available
    #
    # ignorableWhitespace(self, data)
    #     called when potentially ignorable white space has been found
    #
    # internalSubset(self, name, externalID, systemID)
    #     called when a DOCTYPE declaration has been found, name is the
    #     DTD name and externalID, systemID are the DTD public and system
    #     identifiers for that DTD if available
    #
    # notationDecl(self, name, externalID, systemID)
    #     called when a NOTATION declaration has been found, name is the
    #     notation name and externalID, systemID are the notation public and
    #     system identifiers for that notation if available
    #
    # processingInstruction(self, target, data)
    #     called when a PI has been found, target contains the PI name and
    #     data is the associated data in the PI
    #
    # error(self, msg)
|---|
| 173 | |
|---|
def main(aArgs):
    """
    Command-line entry point.

    aArgs is the full argument vector: the program name followed by
    exactly one URL.  The document at that URL is fed through DuoParser
    and the rewritten HTML is written to standard output.

    Returns the process exit status: 0 on success, 1 on a usage error and
    2 when no usable parser is available.
    """
    if len(aArgs) != 2:
        # An empty argument vector has no program name to show.
        sProg = aArgs[0] if aArgs else "duophage"
        # Diagnostics go to stderr: stdout carries the generated HTML and
        # is normally redirected into the output file.
        sys.stderr.write("\n".join([
            sProg + " 'http://example/foo' > output.html",
            " ---",
            " Duophage makes HTML pages self-contained by looking for",
            " src='...' attributes, downloading the data and encoding",
            " it into a 'data:...' URI.",
            "",
        ]))
        return 1

    sUrl = aArgs[1]

    # Currently the libxml2 SAX parser is looking even less robust
    # against bad HTML than the stock HTMLParser :/
    # if bHaveLibxml2:
    #     oH = DuoLxmlParser(sys.stdout, sUrl)
    #     oP = createPushParser(oH, "", 0, sUrl)
    #     for s in urllib.urlopen(sUrl):
    #         oP.parseChunk(s, len(s), 0)
    #     oP.parseChunk("", 0, 1)

    if not bHaveHTMLParser:
        sys.stderr.write("No parsers found (tried HTMLParser and libxml2).\n")
        return 2

    oP = DuoParser(sys.stdout, sUrl)
    fIn = urllib.urlopen(sUrl)
    try:
        for s in fIn:
            oP.feed(s)
        # Force the parser to process whatever it is still buffering.
        oP.close()
    finally:
        # Release the connection even if parsing fails part-way through.
        fIn.close()

    return 0
|---|
| 204 | |
|---|
# Script entry point: pass the command line to main() and use its return
# value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
|---|