source: subversion/applications/routing/pyroute/feedparser.py @ 5853

Last change on this file since 5853 was 5495, checked in by ojw, 12 years ago

Changes from the weekend

File size: 120.1 KB
#!/usr/bin/env python
"""Universal feed parser

Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds

Visit http://feedparser.org/ for the latest version
Visit http://feedparser.org/docs/ for the latest documentation

Required: Python 2.1 or later
Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""

__version__ = "4.1" # + "$Revision: 1.92 $"[11:15] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>"]
_debug = 0

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name.  These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
# if TIDY_MARKUP = 1.
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
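
# For example (hypothetical application name and URL, shown for illustration),
# an embedding application might override these module-level settings after
# importing feedparser:
#
#     import feedparser
#     feedparser.USER_AGENT = 'MyApp/1.0 +http://example.com/myapp/'
#     feedparser.ACCEPT_HEADER = None  # send no Accept header at all
#     feedparser.TIDY_MARKUP = 1       # requires mxTidy or utidylib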

# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
    from cStringIO import StringIO as _StringIO
except:
    from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except:
    gzip = None
try:
    import zlib
except:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        return data

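# Note that either implementation of _xmlescape handles the same three
# characters, e.g. _xmlescape('<a & b>') yields '&lt;a &amp; b&gt;'.
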
# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except:
    base64 = binascii = None

# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except:
    chardet = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')

SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }

try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.has_key(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.has_key(key)

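# A minimal sketch of the aliasing above (runs only in debug mode): legacy
# keys in keymap resolve to their modern equivalents, and keys double as
# attributes.
if _debug:
    _d = FeedParserDict()
    _d['updated'] = '2006-01-01'
    assert _d['modified'] == _d['updated'] # 'modified' is an alias for 'updated'
    assert _d.updated == '2006-01-01'      # attribute access falls back to keys
    del _d
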
def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc

_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans(
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)
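
# A quick sanity sketch of the fixup/join behavior above (illustrative URLs,
# runs only in debug mode):
if _debug:
    assert _urljoin('http://a.example/feed', '/entry/1') == 'http://a.example/entry/1'
    assert _urljoin('http://a.example/', 'http:///b.example/') == 'http://b.example/'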

class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/':                               'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/':         'ag',
                  'http://purl.org/rss/1.0/modules/annotate/':            'annotate',
                  'http://media.tangent.org/rss/1.0/':                    'audio',
                  'http://backend.userland.com/blogChannelModule':        'blogChannel',
                  'http://web.resource.org/cc/':                          'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company':              'co',
                  'http://purl.org/rss/1.0/modules/content/':             'content',
                  'http://my.theinfo.org/changed/1.0/rss/':               'cp',
                  'http://purl.org/dc/elements/1.1/':                     'dc',
                  'http://purl.org/dc/terms/':                            'dcterms',
                  'http://purl.org/rss/1.0/modules/email/':               'email',
                  'http://purl.org/rss/1.0/modules/event/':               'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0':           'feedburner',
                  'http://freshmeat.net/rss/fm/':                         'fm',
                  'http://xmlns.com/foaf/0.1/':                           'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#':             'geo',
                  'http://postneo.com/icbm/':                             'icbm',
                  'http://purl.org/rss/1.0/modules/image/':               'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd':           'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
                  'http://purl.org/rss/1.0/modules/link/':                'l',
                  'http://search.yahoo.com/mrss':                         'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#':                'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/':           'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/':           'reqv',
                  'http://purl.org/rss/1.0/modules/search/':              'search',
                  'http://purl.org/rss/1.0/modules/slash/':               'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/':            'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/':       'ss',
                  'http://hacks.benhammersley.com/rss/streaming/':        'str',
                  'http://purl.org/rss/1.0/modules/subscription/':        'sub',
                  'http://purl.org/rss/1.0/modules/syndication/':         'sy',
                  'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo',
                  'http://purl.org/rss/1.0/modules/threading/':           'thr',
                  'http://purl.org/rss/1.0/modules/textinput/':           'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
                  'http://wellformedweb.org/commentAPI/':                 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/':                'wiki',
                  'http://www.w3.org/1999/xhtml':                         'xhtml',
                  'http://www.w3.org/XML/1998/namespace':                 'xml',
                  'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf'
                  }
    _matchnamespaces = {}
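    # _matchnamespaces is populated lazily (in __init__) with the lowercased
    # URIs above, so namespace matching is effectively case-insensitive.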

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang

    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            # Note: probably shouldn't simply recreate localname here, but
            # our namespace handling isn't actually 100% correct in cases where
            # the feed redefines the default namespace (which is actually
            # the usual case for inline content, thanks Sam), so here we
            # cheat and just reconstruct the element based on localname
            # because that compensates for the bugs in our namespace handling.
            # This will horribly munge inline content with non-empty qnames,
            # but nobody actually does that, so I'm not fixing it.
            tag = tag.split(':')[-1]
            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)

        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)

    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)

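    # For example, name2cp('copy') above is 169, so '&copy;' is appended as
    # the UTF-8 encoding of u'\xa9'; the five XML-special entities stay escaped.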
    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata)
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1

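    # e.g. parse_declaration re-escapes '<![CDATA[<b>bold</b>]]>' and feeds it
    # back through handle_data so it survives as literal text.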
    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

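    # For example, mapContentType('xhtml') returns 'application/xhtml+xml',
    # while a full MIME type such as 'text/html' passes through unchanged.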
    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        # resolve relative URIs within embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos != -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

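    # e.g. _itsAnHrefDamnIt normalizes {'url': u}, {'uri': u}, and {'href': u}
    # to the single canonical form {'href': u}.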
    def _save(self, key, value):
        context = self._getContext()
        context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        if not self.version:
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value
    _end_itunes_name = _end_name

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value

    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['href'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context

    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # probably a better way to do the following, but it passes all the tests
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email

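    # For example, _sync_author_detail splits an author of
    # 'Mark Pilgrim (mark@example.com)' into name 'Mark Pilgrim' and
    # email 'mark@example.com' (illustrative address).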
994    def _start_subtitle(self, attrsD):
995        self.pushContent('subtitle', attrsD, 'text/plain', 1)
996    _start_tagline = _start_subtitle
997    _start_itunes_subtitle = _start_subtitle
998
999    def _end_subtitle(self):
1000        self.popContent('subtitle')
1001    _end_tagline = _end_subtitle
1002    _end_itunes_subtitle = _end_subtitle
1003           
1004    def _start_rights(self, attrsD):
1005        self.pushContent('rights', attrsD, 'text/plain', 1)
1006    _start_dc_rights = _start_rights
1007    _start_copyright = _start_rights
1008
1009    def _end_rights(self):
1010        self.popContent('rights')
1011    _end_dc_rights = _end_rights
1012    _end_copyright = _end_rights
1013
1014    def _start_item(self, attrsD):
1015        self.entries.append(FeedParserDict())
1016        self.push('item', 0)
1017        self.inentry = 1
1018        self.guidislink = 0
1019        id = self._getAttribute(attrsD, 'rdf:about')
1020        if id:
1021            context = self._getContext()
1022            context['id'] = id
1023        self._cdf_common(attrsD)
1024    _start_entry = _start_item
1025    _start_product = _start_item
1026
1027    def _end_item(self):
1028        self.pop('item')
1029        self.inentry = 0
1030    _end_entry = _end_item
1031
1032    def _start_dc_language(self, attrsD):
1033        self.push('language', 1)
1034    _start_language = _start_dc_language
1035
1036    def _end_dc_language(self):
1037        self.lang = self.pop('language')
1038    _end_language = _end_dc_language
1039
1040    def _start_dc_publisher(self, attrsD):
1041        self.push('publisher', 1)
1042    _start_webmaster = _start_dc_publisher
1043
1044    def _end_dc_publisher(self):
1045        self.pop('publisher')
1046        self._sync_author_detail('publisher')
1047    _end_webmaster = _end_dc_publisher
1048
1049    def _start_published(self, attrsD):
1050        self.push('published', 1)
1051    _start_dcterms_issued = _start_published
1052    _start_issued = _start_published
1053
1054    def _end_published(self):
1055        value = self.pop('published')
1056        self._save('published_parsed', _parse_date(value))
1057    _end_dcterms_issued = _end_published
1058    _end_issued = _end_published
1059
1060    def _start_updated(self, attrsD):
1061        self.push('updated', 1)
1062    _start_modified = _start_updated
1063    _start_dcterms_modified = _start_updated
1064    _start_pubdate = _start_updated
1065    _start_dc_date = _start_updated
1066
1067    def _end_updated(self):
1068        value = self.pop('updated')
1069        parsed_value = _parse_date(value)
1070        self._save('updated_parsed', parsed_value)
1071    _end_modified = _end_updated
1072    _end_dcterms_modified = _end_updated
1073    _end_pubdate = _end_updated
1074    _end_dc_date = _end_updated
1075
1076    def _start_created(self, attrsD):
1077        self.push('created', 1)
1078    _start_dcterms_created = _start_created
1079
1080    def _end_created(self):
1081        value = self.pop('created')
1082        self._save('created_parsed', _parse_date(value))
1083    _end_dcterms_created = _end_created
1084
1085    def _start_expirationdate(self, attrsD):
1086        self.push('expired', 1)
1087
1088    def _end_expirationdate(self):
1089        self._save('expired_parsed', _parse_date(self.pop('expired')))
1090
1091    def _start_cc_license(self, attrsD):
1092        self.push('license', 1)
1093        value = self._getAttribute(attrsD, 'rdf:resource')
1094        if value:
1095            self.elementstack[-1][2].append(value)
1096        self.pop('license')
1097       
1098    def _start_creativecommons_license(self, attrsD):
1099        self.push('license', 1)
1100
1101    def _end_creativecommons_license(self):
1102        self.pop('license')
1103
1104    def _addTag(self, term, scheme, label):
1105        context = self._getContext()
1106        tags = context.setdefault('tags', [])
1107        if (not term) and (not scheme) and (not label): return
1108        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
1109        if value not in tags:
1110            tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))
1111
1112    def _start_category(self, attrsD):
1113        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
1114        term = attrsD.get('term')
1115        scheme = attrsD.get('scheme', attrsD.get('domain'))
1116        label = attrsD.get('label')
1117        self._addTag(term, scheme, label)
1118        self.push('category', 1)
1119    _start_dc_subject = _start_category
1120    _start_keywords = _start_category
1121       
1122    def _end_itunes_keywords(self):
1123        for term in self.pop('itunes_keywords').split():
1124            self._addTag(term, 'http://www.itunes.com/', None)
1125       
1126    def _start_itunes_category(self, attrsD):
1127        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
1128        self.push('category', 1)
1129       
1130    def _end_category(self):
1131        value = self.pop('category')
1132        if not value: return
1133        context = self._getContext()
1134        tags = context['tags']
1135        if value and len(tags) and not tags[-1]['term']:
1136            tags[-1]['term'] = value
1137        else:
1138            self._addTag(value, None, None)
1139    _end_dc_subject = _end_category
1140    _end_keywords = _end_category
1141    _end_itunes_category = _end_category
1142
1143    def _start_cloud(self, attrsD):
1144        self._getContext()['cloud'] = FeedParserDict(attrsD)
1145       
1146    def _start_link(self, attrsD):
1147        attrsD.setdefault('rel', 'alternate')
1148        attrsD.setdefault('type', 'text/html')
1149        attrsD = self._itsAnHrefDamnIt(attrsD)
1150        if attrsD.has_key('href'):
1151            attrsD['href'] = self.resolveURI(attrsD['href'])
1152        expectingText = self.infeed or self.inentry or self.insource
1153        context = self._getContext()
1154        context.setdefault('links', [])
1155        context['links'].append(FeedParserDict(attrsD))
1156        if attrsD['rel'] == 'enclosure':
1157            self._start_enclosure(attrsD)
1158        if attrsD.has_key('href'):
1159            expectingText = 0
1160            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1161                context['link'] = attrsD['href']
1162        else:
1163            self.push('link', expectingText)
1164    _start_producturl = _start_link
1165
1166    def _end_link(self):
1167        value = self.pop('link')
1168        context = self._getContext()
1169        if self.intextinput:
1170            context['textinput']['link'] = value
1171        if self.inimage:
1172            context['image']['link'] = value
1173    _end_producturl = _end_link
1174
1175    def _start_guid(self, attrsD):
1176        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1177        self.push('id', 1)
1178
1179    def _end_guid(self):
1180        value = self.pop('id')
1181        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1182        if self.guidislink:
1183            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1184            # and only if the item doesn't already have a link element
1185            self._save('link', value)
1186
1187    def _start_title(self, attrsD):
1188        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1189    _start_dc_title = _start_title
1190    _start_media_title = _start_title
1191
1192    def _end_title(self):
1193        value = self.popContent('title')
1194        context = self._getContext()
1195        if self.intextinput:
1196            context['textinput']['title'] = value
1197        elif self.inimage:
1198            context['image']['title'] = value
1199    _end_dc_title = _end_title
1200    _end_media_title = _end_title
1201
1202    def _start_description(self, attrsD):
1203        context = self._getContext()
1204        if context.has_key('summary'):
1205            self._summaryKey = 'content'
1206            self._start_content(attrsD)
1207        else:
1208            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
1209
1210    def _start_abstract(self, attrsD):
1211        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1212
1213    def _end_description(self):
1214        if self._summaryKey == 'content':
1215            self._end_content()
1216        else:
1217            value = self.popContent('description')
1218            context = self._getContext()
1219            if self.intextinput:
1220                context['textinput']['description'] = value
1221            elif self.inimage:
1222                context['image']['description'] = value
1223        self._summaryKey = None
1224    _end_abstract = _end_description
1225
1226    def _start_info(self, attrsD):
1227        self.pushContent('info', attrsD, 'text/plain', 1)
1228    _start_feedburner_browserfriendly = _start_info
1229
1230    def _end_info(self):
1231        self.popContent('info')
1232    _end_feedburner_browserfriendly = _end_info
1233
1234    def _start_generator(self, attrsD):
1235        if attrsD:
1236            attrsD = self._itsAnHrefDamnIt(attrsD)
1237            if attrsD.has_key('href'):
1238                attrsD['href'] = self.resolveURI(attrsD['href'])
1239        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1240        self.push('generator', 1)
1241
1242    def _end_generator(self):
1243        value = self.pop('generator')
1244        context = self._getContext()
1245        if context.has_key('generator_detail'):
1246            context['generator_detail']['name'] = value
1247           
1248    def _start_admin_generatoragent(self, attrsD):
1249        self.push('generator', 1)
1250        value = self._getAttribute(attrsD, 'rdf:resource')
1251        if value:
1252            self.elementstack[-1][2].append(value)
1253        self.pop('generator')
1254        self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1255
1256    def _start_admin_errorreportsto(self, attrsD):
1257        self.push('errorreportsto', 1)
1258        value = self._getAttribute(attrsD, 'rdf:resource')
1259        if value:
1260            self.elementstack[-1][2].append(value)
1261        self.pop('errorreportsto')
1262       
1263    def _start_summary(self, attrsD):
1264        context = self._getContext()
1265        if context.has_key('summary'):
1266            self._summaryKey = 'content'
1267            self._start_content(attrsD)
1268        else:
1269            self._summaryKey = 'summary'
1270            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
1271    _start_itunes_summary = _start_summary
1272
1273    def _end_summary(self):
1274        if self._summaryKey == 'content':
1275            self._end_content()
1276        else:
1277            self.popContent(self._summaryKey or 'summary')
1278        self._summaryKey = None
1279    _end_itunes_summary = _end_summary
1280       
1281    def _start_enclosure(self, attrsD):
1282        attrsD = self._itsAnHrefDamnIt(attrsD)
1283        self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
1284        href = attrsD.get('href')
1285        if href:
1286            context = self._getContext()
1287            if not context.get('id'):
1288                context['id'] = href
1289           
1290    def _start_source(self, attrsD):
1291        self.insource = 1
1292
1293    def _end_source(self):
1294        self.insource = 0
1295        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1296        self.sourcedata.clear()
1297
1298    def _start_content(self, attrsD):
1299        self.pushContent('content', attrsD, 'text/plain', 1)
1300        src = attrsD.get('src')
1301        if src:
1302            self.contentparams['src'] = src
1303        self.push('content', 1)
1304
1305    def _start_prodlink(self, attrsD):
1306        self.pushContent('content', attrsD, 'text/html', 1)
1307
1308    def _start_body(self, attrsD):
1309        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
1310    _start_xhtml_body = _start_body
1311
1312    def _start_content_encoded(self, attrsD):
1313        self.pushContent('content', attrsD, 'text/html', 1)
1314    _start_fullitem = _start_content_encoded
1315
1316    def _end_content(self):
1317        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
1318        value = self.popContent('content')
1319        if copyToDescription:
1320            self._save('description', value)
1321    _end_body = _end_content
1322    _end_xhtml_body = _end_content
1323    _end_content_encoded = _end_content
1324    _end_fullitem = _end_content
1325    _end_prodlink = _end_content
1326
1327    def _start_itunes_image(self, attrsD):
1328        self.push('itunes_image', 0)
1329        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1330    _start_itunes_link = _start_itunes_image
1331       
1332    def _end_itunes_block(self):
1333        value = self.pop('itunes_block', 0)
1334        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1335
1336    def _end_itunes_explicit(self):
1337        value = self.pop('itunes_explicit', 0)
1338        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0

if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        def __init__(self, baseuri, baselang, encoding):
            if _debug: sys.stderr.write('trying StrictFeedParser\n')
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            self.bozo = 0
            self.exc = None

        def startPrefixMapping(self, prefix, uri):
            self.trackNamespace(prefix, uri)

        def startElementNS(self, name, qname, attrs):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if lowernamespace.find('backend.userland.com/rss') != -1:
                # match any backend.userland.com namespace
                namespace = 'http://backend.userland.com/rss'
                lowernamespace = namespace
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = None
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
                raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

            # qname implementation is horribly broken in Python 2.1 (it
            # doesn't report any), and slightly broken in Python 2.2 (it
            # doesn't report the xml: namespace). So we match up namespaces
            # with a known list first, and then possibly override them with
            # the qnames the SAX parser gives us (if indeed it gives us any
            # at all).  Thanks to MatejC for helping me test this and
            # tirelessly telling me that it didn't work yet.
            attrsD = {}
            for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
                lowernamespace = (namespace or '').lower()
                prefix = self._matchnamespaces.get(lowernamespace, '')
                if prefix:
                    attrlocalname = prefix + ':' + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            self.handle_data(text)

        def endElementNS(self, name, qname):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = ''
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            self.bozo = 1
            self.exc = exc

        def fatalError(self, exc):
            self.error(exc)
            raise exc

class _BaseHTMLProcessor(sgmllib.SGMLParser):
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs
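
    # Illustrative sketch (not part of the upstream API): normalize_attrs
    # lowercases attribute names, and also lowercases the values of 'rel'
    # and 'type' attributes --
    #
    #     p = _BaseHTMLProcessor('utf-8')
    #     p.normalize_attrs([('HREF', '/x'), ('Rel', 'ALTERNATE')])
    #     # -> [('href', '/x'), ('rel', 'alternate')]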

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
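
# Usage sketch (illustrative): _BaseHTMLProcessor round-trips HTML through
# sgmllib, normalizing empty elements along the way --
#
#     p = _BaseHTMLProcessor('utf-8')
#     p.feed('<p>Hello<br>world</p>')
#     p.output()   # -> '<p>Hello<br />world</p>'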

class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
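
    # Example (illustrative): decodeEntities re-escapes the numeric forms of
    # the XML special characters so the loose (sgmllib-based) parser doesn't
    # mistake them for markup --
    #
    #     p.decodeEntities('summary', 'AT&#38;T &#60;b&#62;')
    #     # -> 'AT&amp;T &lt;b&gt;'  (while the content type is an XML type)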

class _RelativeURIResolver(_BaseHTMLProcessor):
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding)
    p.feed(htmlSource)
    return p.output()
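
# Sketch (illustrative): resolving relative URIs in embedded HTML against a
# base URI (example.org is a placeholder) --
#
#     _resolveRelativeURIs('<a href="/about">x</a>', 'http://example.org/feed', 'utf-8')
#     # -> '<a href="http://example.org/about">x</a>'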

class _HTMLSanitizer(_BaseHTMLProcessor):
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

def _sanitizeHTML(htmlSource, encoding):
    p = _HTMLSanitizer(encoding)
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
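
# Sketch (illustrative): _sanitizeHTML drops elements and attributes that
# aren't whitelisted, including everything inside script and applet --
#
#     _sanitizeHTML('<p onclick="evil()">hi<script>evil()</script></p>', 'utf-8')
#     # -> '<p>hi</p>'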

class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            assert sys.version.split()[0] >= '2.3.3'
            assert base64 is not None
            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except:
            return self.http_error_default(req, fp, code, msg, headers)

def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.encodestring(user_passwd).strip()
        # try to open with urllib2 (to use optional headers)
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header('User-Agent', agent)
        if etag:
            request.add_header('If-None-Match', etag)
        if modified:
            # format into an RFC 1123-compliant timestamp. We can't use
            # time.strftime() since the %a and %b directives can be affected
            # by the current locale, but RFC 2616 states that dates must be
            # in English.
            short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
            months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
            request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
        if referrer:
            request.add_header('Referer', referrer)
        if gzip and zlib:
            request.add_header('Accept-encoding', 'gzip, deflate')
        elif gzip:
            request.add_header('Accept-encoding', 'gzip')
        elif zlib:
            request.add_header('Accept-encoding', 'deflate')
        else:
            request.add_header('Accept-encoding', '')
        if auth:
            request.add_header('Authorization', 'Basic %s' % auth)
        if ACCEPT_HEADER:
            request.add_header('Accept', ACCEPT_HEADER)
        request.add_header('A-IM', 'feed') # RFC 3229 support
        opener = urllib2.build_opener(*([_FeedURLHandler()] + handlers))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string)
    except:
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
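
# Usage sketch (illustrative): a conditional GET, feeding back the etag from
# an earlier fetch so unchanged feeds come back as 304 --
#
#     f = _open_resource('http://example.org/feed.xml', '"xyzzy"', None,
#                        USER_AGENT, None, [])
#     data = f.read()
#     f.close()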

_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    _date_handlers.insert(0, func)
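
# Sketch (illustrative): plugging in a handler for a hypothetical date format.
# A handler takes the date string and returns a 9-tuple in GMT, or None to
# let the next handler try --
#
#     def _parse_date_epoch(dateString):
#         '''Parse a string of seconds since the epoch (hypothetical format)'''
#         if not dateString.isdigit(): return
#         return time.gmtime(int(dateString))
#     registerDateHandler(_parse_date_epoch)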

# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance, is 030104 2003-01-04 or
# 0301-04-01?), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
del tmpl
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
del regex
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
                 params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(params.get('second', 0))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    # daylight savings is complex, but not needed for feedparser's purposes
    # as time zones, if specified, include mention of whether it is active
    # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
    # most implementations have DST bugs
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
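
# Example (illustrative): the templates above accept both compact and
# extended ISO 8601 forms --
#
#     _parse_date_iso8601('20040105')                  # compact date
#     _parse_date_iso8601('2003-12-31T10:14:55Z')      # full timestamp
#     # each returns a 9-tuple, or None if nothing matches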

# 8-bit date handling routines written by ytrewq1.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)

def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)

_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
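
# Note: like the Korean handlers above, _parse_date_mssql assumes a fixed
# +09:00 zone offset; the MS SQL format itself carries no zone information.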

# Unicode strings for Greek date strings
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
  }

_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except:
        return
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)

# Unicode strings for Hungarian date strings
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'febru\u00e1r':  u'02',  # standard spelling of February
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'm\u00e1jus':    u'05',  # standard spelling of May; the entry above looks like a typo
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except:
        return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)

# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Drake and licensed under the Python license.  Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
def _parse_date_w3dtf(dateString):
    def __extract_date(m):
        year = int(m.group('year'))
        if year < 100:
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group('julian')
        if julian:
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group('month')
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group('day')
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        if not m:
            return 0, 0, 0
        hours = m.group('hours')
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        if not m:
            return 0
        tzd = m.group('tzd')
        if not tzd:
            return 0
        if tzd == 'Z':
            return 0
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        if tzd[0] == '+':
            return -offset
        return offset

    __date_re = ('(?P<year>\d\d\d\d)'
                 '(?:(?P<dsep>-|)'
                 '(?:(?P<julian>\d\d\d)'
                 '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
                 + __tzd_re)
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
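
# Example (illustrative): W3DTF/RFC 3339-style timestamps --
#
#     _parse_date_w3dtf('2003-12-31T10:14:55Z')        # UTC
#     _parse_date_w3dtf('2003-12-31T10:14:55-08:00')   # offset converted to GMT
#     # each returns a 9-tuple in GMT, or None if the string doesn't match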

def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    if len(data) < 5:
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
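
# Example (illustrative): RFC 822 dates, with or without the weekday, and
# with a missing time defaulting to midnight GMT --
#
#     _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')
#     _parse_date_rfc822('01 Jan 2004')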

def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
            if not date9tuple: continue
            if len(date9tuple) != 9:
                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
                raise ValueError
            map(int, date9tuple)
            return date9tuple
        except Exception, e:
            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
            pass
    return None
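
# Sketch (illustrative): _parse_date tries the registered handlers in order
# (most recently registered first) and returns the first plausible 9-tuple --
#
#     _parse_date('Thu, 01 Jan 2004 19:48:21 GMT')   # handled by _parse_date_rfc822
#     _parse_date('this is not a date')              # -> None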

def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
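
# Example (illustrative): per RFC 3023, the HTTP charset outranks the XML
# declaration for application/*+xml --
#
#     _getCharacterEncoding({'content-type': 'application/atom+xml; charset=iso-8859-1'},
#                           "<?xml version='1.0' encoding='utf-8'?><feed/>")
#     # -> ('iso-8859-1', 'iso-8859-1', 'utf-8', '', 1)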

def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present)
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
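
# Example (illustrative): re-declaring a latin-1 document as utf-8 --
#
#     _toUTF8("<?xml version='1.0' encoding='iso-8859-1'?><a/>", 'iso-8859-1')
#     # -> "<?xml version='1.0' encoding='utf-8'?><a/>" as a utf-8 byte string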

def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE
    '''
    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
    data = entity_pattern.sub('', data)
    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
    doctype_results = doctype_pattern.findall(data)
    doctype = doctype_results and doctype_results[0] or ''
    if doctype.lower().count('netscape'):
        version = 'rss091n'
    else:
        version = None
    data = doctype_pattern.sub('', data)
    return version, data
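
# Example (illustrative): Netscape's RSS 0.91 DOCTYPE is detected and stripped --
#
#     _stripDoctype('<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN" '
#                   '"http://my.netscape.com/publish/formats/rss-0.91.dtd"><rss/>')
#     # -> ('rss091n', '<rss/>')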

def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
    '''Parse a feed from a URL, file, stream, or string'''
    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    if _XML_AVAILABLE:
        result['bozo'] = 0
    if type(handlers) == types.InstanceType:
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''
        f = None

    # if feed is gzip-compressed, decompress it
    if f and data and hasattr(f, 'headers'):
        if gzip and f.headers.get('content-encoding', '') == 'gzip':
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except Exception, e:
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage.  Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
            try:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''

    # save HTTP headers
    if hasattr(f, 'info'):
        info = f.info()
        result['etag'] = info.getheader('ETag')
        last_modified = info.getheader('Last-Modified')
        if last_modified:
            result['modified'] = _parse_date(last_modified)
    if hasattr(f, 'url'):
        result['href'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'headers'):
        result['headers'] = f.headers.dict
    if hasattr(f, 'close'):
        f.close()

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type'):
            bozo_message = '%s is not an XML media type' % http_headers['content-type']
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    result['version'], data = _stripDoctype(data)

    baseuri = http_headers.get('content-location', result.get('href'))
    baselang = http_headers.get('content-language', None)

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = ''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data.  This is a feature, not a bug!'
        return result

    # if there was a problem downloading, we're done
    if not data:
        return result

    # determine character encoding
    use_strict_parser = 0
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
        if not proposed_encoding: continue
        if proposed_encoding in tried_encodings: continue
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
            break
        except:
            pass
    # if no luck and we have an auto-detection library, try that
    if (not known_encoding) and chardet:
        try:
            proposed_encoding = chardet.detect(data)['encoding']
            if proposed_encoding and (proposed_encoding not in tried_encodings):
                tried_encodings.append(proposed_encoding)
                data = _toUTF8(data, proposed_encoding)
                known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and ('utf-8' not in tried_encodings):
        try:
            proposed_encoding = 'utf-8'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and ('windows-1252' not in tried_encodings):
        try:
            proposed_encoding = 'windows-1252'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
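    # To recap the cascade above: candidate encodings are attempted in
    # this order until one converts the document to UTF-8 without error --
    #   1. result['encoding'], computed from the HTTP headers and the XML
    #      declaration as per RFC 3023
    #   2. the encoding named in the <?xml ...?> declaration
    #   3. the encoding sniffed from the first 4 bytes of the data (BOM)
    #   4. whatever chardet detects, if that library is installed
    #   5. utf-8, then windows-1252, as last-ditch guesses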
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, utf-8, and windows-1252 but nothing worked' % \
            (result['encoding'], xml_encoding))
        result['encoding'] = ''
    elif proposed_encoding != result['encoding']:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
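            # (otherwise the stock driver fails to resolve the xml: prefix,
            # which breaks attributes like xml:lang and xml:base)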
        try:
            saxparser.parse(source)
        except Exception, e:
            if _debug:
                import traceback
                traceback.print_stack()
                traceback.print_exc()
                sys.stderr.write('xml parsing failed\n')
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
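    # fall back to the loose parser, which is sgmllib-based and tolerates
    # the unclosed tags, undeclared entities, and similar markup errors
    # that the strict SAX parser rejects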
    if not use_strict_parser:
        feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
        feedparser.feed(data)
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result

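# command-line usage: python feedparser.py <url> [<url> ...]
# each URL is fetched, parsed, and the resulting dictionary pretty-printed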
if __name__ == '__main__':
    if not sys.argv[1:]:
        print __doc__
        sys.exit(0)
    else:
        urls = sys.argv[1:]
    zopeCompatibilityHack()
    from pprint import pprint
    for url in urls:
        print url
        print
        result = parse(url)
        pprint(result)
        print

#REVISION HISTORY
#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
#  added Simon Fell's test suite
#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
#2.0 - 10/19/2002
#  JD - use inchannel to watch out for image and textinput elements which can
#  also contain title, link, and description elements
#  JD - check for isPermaLink='false' attribute on guid elements
#  JD - replaced openAnything with open_resource supporting ETag and
#  If-Modified-Since request headers
#  JD - parse now accepts etag, modified, agent, and referrer optional
#  arguments
#  JD - modified parse to return a dictionary instead of a tuple so that any
#  etag or modified information can be returned and cached by the caller
#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
#  because of etag/modified, return the old etag/modified to the caller to
#  indicate why nothing is being returned
#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
#  useless.  Fixes the problem JD was addressing by adding it.
#2.1 - 11/14/2002 - MAP - added gzip support
#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
#  start_admingeneratoragent is an example of how to handle elements with
#  only attributes, no content.
#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
#  also, make sure we send the User-Agent even if urllib2 isn't available.
#  Match any variation of backend.userland.com/rss namespace.
#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
#  snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
#  project name
#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
#  removed unnecessary urllib code -- urllib2 should always be available anyway;
#  return actual url, status, and full HTTP headers (as result['url'],
#  result['status'], and result['headers']) if parsing a remote feed over HTTP --
#  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
#  added the latest namespace-of-the-week for RSS 2.0
#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
#  User-Agent (otherwise urllib2 sends two, which confuses some servers)
#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
#  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
#  textInput, and also to return the character encoding (if specified)
#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
#  nested divs within content (JohnD); fixed missing sys import (JohanS);
#  fixed regular expression to capture XML character encoding (Andrei);
#  added support for Atom 0.3-style links; fixed bug with textInput tracking;
#  added support for cloud (MartijnP); added support for multiple
#  category/dc:subject (MartijnP); normalize content model: 'description' gets
#  description (which can come from description, summary, or full content if no
#  description), 'content' gets dict of base/language/type/value (which can come
#  from content:encoded, xhtml:body, content, or fullitem);
#  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
#  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
#  <content> element is not in default namespace (like Pocketsoap feed);
#  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
#  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
#  description, xhtml:body, content, content:encoded, title, subtitle,
#  summary, info, tagline, and copyright; added support for pingback and
#  trackback namespaces
#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
#  namespaces, as opposed to 2.6 when I said I did but didn't really;
#  sanitize HTML markup within some elements; added mxTidy support (if
#  installed) to tidy HTML markup within some elements; fixed indentation
#  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
#  (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
#  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
#  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
#  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
#2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
#  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
#  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
#  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
#  fixed relative URI processing for guid (skadz); added ICBM support; added
#  base64 support
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
#  blogspot.com sites); added _debug variable
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
#  added several new supported namespaces; fixed bug tracking naked markup in
#  description; added support for enclosure; added support for source; re-added
#  support for cloud which got dropped somehow; added support for expirationDate
#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
#  xml:base URI, one for documents that don't define one explicitly and one for
#  documents that define an outer and an inner xml:base that goes out of scope
#  before the end of the document
#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
#  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
#  added support for creativeCommons:license and cc:license; added support for
#  full Atom content model in title, tagline, info, copyright, summary; fixed bug
#  with gzip encoding (not always telling server we support it when we do)
#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
#  (dictionary of 'name', 'url', 'email'); map author to author_detail if author
#  contains name + email address
#3.0b8 - 1/28/2004 - MAP - added support for contributor
#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
#  support for summary
#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
#  xml.util.iso8601
#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
#  dangerous markup; fiddled with decodeEntities (not right); liberalized
#  date parsing even further
#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
#  added support for Atom 0.2 subtitle; added support for Atom content model
#  in copyright; better sanitizing of dangerous HTML elements with end tags
#  (script, frameset)
#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
#  Python 2.1
#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
#  fixed bug capturing author and contributor URL; fixed bug resolving relative
#  links in author and contributor URL; fixed bug resolving relative links in
#  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
#  namespace tests, and included them permanently in the test suite with his
#  permission; fixed namespace handling under Python 2.1
#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
#  use libxml2 (if available)
#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
#  name was in parentheses; removed ultra-problematic mxTidy support; patch to
#  workaround crash in PyXML/expat when encountering invalid entities
#  (MarkMoraes); support for textinput/textInput
#3.0b20 - 4/7/2004 - MAP - added CDF support
#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
#  results dict; changed results dict to allow getting values with results.key
#  as well as results[key]; work around embedded illformed HTML with half
#  a DOCTYPE; work around malformed Content-Type header; if character encoding
#  is wrong, try several common ones before falling back to regexes (if this
#  works, bozo_exception is set to CharacterEncodingOverride); fixed character
#  encoding issues in BaseHTMLProcessor by tracking encoding and converting
#  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
#  convert each value in results to Unicode (if possible), even if using
#  regex-based parsing
#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
#  high-bit characters in attributes in embedded HTML in description (thanks
#  Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
#  FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
#  about a mapped key
#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
#  results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
#  cause the same encoding to be tried twice (even if it failed the first time);
#  fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
#  better textinput and image tracking in illformed RSS 1.0 feeds
#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
#  my blink tag tests
#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
#  failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
#  duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
#  added support for image; refactored parse() fallback logic to try other
#  encodings if SAX parsing fails (previously it would only try other encodings
#  if re-encoding failed); remove unichr madness in normalize_attrs now that
#  we're properly tracking encoding in and out of BaseHTMLProcessor; set
#  feed.language from root-level xml:lang; set entry.id from rdf:about;
#  send Accept header
#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
#  iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
#  windows-1252); fixed regression that could cause the same encoding to be
#  tried twice (even if it failed the first time)
#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
#  recover from malformed content-type header parameter with no equals sign
#  ('text/xml; charset:iso-8859-1')
#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
#  to Unicode equivalents in illformed feeds (aaronsw); added and
#  passed tests for converting character entities to Unicode equivalents
#  in illformed feeds (aaronsw); test for valid parsers when setting
#  XML_AVAILABLE; make version and encoding available when server returns
#  a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
#  digest auth or proxy support); add code to parse username/password
#  out of url and send as basic authentication; expose downloading-related
#  exceptions in bozo_exception (aaronsw); added __contains__ method to
#  FeedParserDict (aaronsw); added publisher_detail (aaronsw)
#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
#  convert feed to UTF-8 before passing to XML parser; completely revamped
#  logic for determining character encoding and attempting XML parsing
#  (much faster); increased default timeout to 20 seconds; test for presence
#  of Location header on redirects; added tests for many alternate character
#  encodings; support various EBCDIC encodings; support UTF-16BE and
#  UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
#  UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
#  XML parsers are available; added support for 'Content-encoding: deflate';
#  send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
#  are available
#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
#  problem tracking xml:base and xml:lang if element declares it, child
#  doesn't, first grandchild redeclares it, and second grandchild doesn't;
#  refactored date parsing; defined public registerDateHandler so callers
#  can add support for additional date formats at runtime; added support
#  for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
#  zopeCompatibilityHack() which turns FeedParserDict into a regular
#  dictionary, required for Zope compatibility, and also makes command-
#  line debugging easier because pprint module formats real dictionaries
#  better than dictionary-like objects; added NonXMLContentType exception,
#  which is stored in bozo_exception when a feed is served with a non-XML
#  media type such as 'text/plain'; respect Content-Language as default
#  language if no xml:lang is present; cloud dict is now FeedParserDict;
#  generator dict is now FeedParserDict; better tracking of xml:lang,
#  including support for xml:lang='' to unset the current language;
#  recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
#  namespace; don't overwrite final status on redirects (scenarios:
#  redirecting to a URL that returns 304, redirecting to a URL that
#  redirects to another URL with a different type of redirect); add
#  support for HTTP 303 redirects
#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
#  encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
#  support for Atom 1.0; support for iTunes extensions; new 'tags' for
#  categories/keywords/etc. as array of dict
#  {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
#  terminology; parse RFC 822-style dates with no time; lots of other
#  bug fixes
#4.1 - MAP - removed socket timeout; added support for chardet library