source: subversion/applications/routing/pyroute-dev/feedparser.py @ 34690

Last change on this file since 34690 was 18449, checked in by buerste, 10 years ago

-adding rev replacement

  • Property svn:keywords set to Rev
File size: 122.9 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Universal feed parser

Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds

Visit http://feedparser.org/ for the latest version
Visit http://feedparser.org/docs/ for the latest documentation

Required: Python 2.1 or later
Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""

__version__ = "4.1"# + "$Revision: 18449 $"[11:15] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>"]
_debug = 0

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
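# For example, an embedding application could set (illustrative):
#     feedparser.USER_AGENT = 'MyApp/1.0 +http://example.com/myapp/'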

# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name.  These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
    from cStringIO import StringIO as _StringIO
except:
    from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except:
    gzip = None
try:
    import zlib
except:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        return data

# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except:
    base64 = binascii = None

# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except:
    chardet = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

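# The next three lines loosen sgmllib's lexing regexes; the apparent intent
# (not stated in the original): allow ':' and '.' in tag names so namespaced
# tags survive, treat only '<!' as a declaration, and require character
# references to be explicitly terminated.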
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')

SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }
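# The keys above are the values the parser assigns to self.version; after
# parsing, the detected value is exposed on the result, e.g. (illustrative)
# feedparser.parse(url).version == 'atom10'.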

try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.has_key(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.has_key(key)

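# FeedParserDict transparently maps pre-Atom-1.0 key names onto their modern
# equivalents via keymap, and supports attribute-style access. For example
# (illustrative):
#     d = FeedParserDict()
#     d['tagline'] = 'spam'   # stored under the Atom 1.0 name 'subtitle'
#     d['subtitle']           # -> 'spam'
#     d.subtitle              # -> 'spam'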
def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc

_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)
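# e.g. _urljoin('http://feedparser.org/docs/', 'examples/atom10.xml') returns
# 'http://feedparser.org/docs/examples/atom10.xml'; the _urifixer pass first
# collapses stray slashes after the scheme, so 'http:////a' becomes 'http://a'.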

class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/':                               'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/':         'ag',
                  'http://purl.org/rss/1.0/modules/annotate/':            'annotate',
                  'http://media.tangent.org/rss/1.0/':                    'audio',
                  'http://backend.userland.com/blogChannelModule':        'blogChannel',
                  'http://web.resource.org/cc/':                          'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company':              'co',
                  'http://purl.org/rss/1.0/modules/content/':             'content',
                  'http://my.theinfo.org/changed/1.0/rss/':               'cp',
                  'http://purl.org/dc/elements/1.1/':                     'dc',
                  'http://purl.org/dc/terms/':                            'dcterms',
                  'http://purl.org/rss/1.0/modules/email/':               'email',
                  'http://purl.org/rss/1.0/modules/event/':               'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0':           'feedburner',
                  'http://freshmeat.net/rss/fm/':                         'fm',
                  'http://xmlns.com/foaf/0.1/':                           'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#':             'geo',
                  'http://postneo.com/icbm/':                             'icbm',
                  'http://purl.org/rss/1.0/modules/image/':               'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd':           'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
                  'http://purl.org/rss/1.0/modules/link/':                'l',
                  'http://search.yahoo.com/mrss':                         'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#':                'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/':           'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/':           'reqv',
                  'http://purl.org/rss/1.0/modules/search/':              'search',
                  'http://purl.org/rss/1.0/modules/slash/':               'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/':            'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/':       'ss',
                  'http://hacks.benhammersley.com/rss/streaming/':        'str',
                  'http://purl.org/rss/1.0/modules/subscription/':        'sub',
                  'http://purl.org/rss/1.0/modules/syndication/':         'sy',
                  'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo',
                  'http://purl.org/rss/1.0/modules/threading/':           'thr',
                  'http://purl.org/rss/1.0/modules/textinput/':           'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
                  'http://wellformedweb.org/commentAPI/':                 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/':                'wiki',
                  'http://www.w3.org/1999/xhtml':                         'xhtml',
                  'http://www.w3.org/XML/1998/namespace':                 'xml',
                  'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf'
}
    _matchnamespaces = {}

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang

    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            # Note: probably shouldn't simply recreate localname here, but
            # our namespace handling isn't actually 100% correct in cases where
            # the feed redefines the default namespace (which is actually
            # the usual case for inline content, thanks Sam), so here we
            # cheat and just reconstruct the element based on localname
            # because that compensates for the bugs in our namespace handling.
            # This will horribly munge inline content with non-empty qnames,
            # but nobody actually does that, so I'm not fixing it.
            tag = tag.split(':')[-1]
            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
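        # e.g. a <dc:creator> start tag dispatches to _start_dc_creator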
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)

    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)
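        # e.g. '&#8230;' arrives here as ref '8230' and is appended as the
        # UTF-8 encoding of u'\u2026'; the refs special-cased above are left
        # escaped because they are markup-significant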

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata)
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1

    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])
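        # each stack entry is [element name, expectingText flag, list of text pieces]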

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        # resolve relative URIs within embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value):
        context = self._getContext()
        context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        if not self.version:
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value
    _end_itunes_name = _end_name

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value

    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['href'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context

    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # probably a better way to do the following, but it passes all the tests
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email
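            # e.g. an author value of 'Mark Pilgrim (mark@example.com)' is
            # split into name 'Mark Pilgrim' and email 'mark@example.com'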
994
995    def _start_subtitle(self, attrsD):
996        self.pushContent('subtitle', attrsD, 'text/plain', 1)
997    _start_tagline = _start_subtitle
998    _start_itunes_subtitle = _start_subtitle
999
1000    def _end_subtitle(self):
1001        self.popContent('subtitle')
1002    _end_tagline = _end_subtitle
1003    _end_itunes_subtitle = _end_subtitle
1004           
1005    def _start_rights(self, attrsD):
1006        self.pushContent('rights', attrsD, 'text/plain', 1)
1007    _start_dc_rights = _start_rights
1008    _start_copyright = _start_rights
1009
1010    def _end_rights(self):
1011        self.popContent('rights')
1012    _end_dc_rights = _end_rights
1013    _end_copyright = _end_rights
1014
1015    def _start_item(self, attrsD):
1016        self.entries.append(FeedParserDict())
1017        self.push('item', 0)
1018        self.inentry = 1
1019        self.guidislink = 0
1020        id = self._getAttribute(attrsD, 'rdf:about')
1021        if id:
1022            context = self._getContext()
1023            context['id'] = id
1024        self._cdf_common(attrsD)
1025    _start_entry = _start_item
1026    _start_product = _start_item
1027
1028    def _end_item(self):
1029        self.pop('item')
1030        self.inentry = 0
1031    _end_entry = _end_item
1032
1033    def _start_dc_language(self, attrsD):
1034        self.push('language', 1)
1035    _start_language = _start_dc_language
1036
1037    def _end_dc_language(self):
1038        self.lang = self.pop('language')
1039    _end_language = _end_dc_language
1040
1041    def _start_dc_publisher(self, attrsD):
1042        self.push('publisher', 1)
1043    _start_webmaster = _start_dc_publisher
1044
1045    def _end_dc_publisher(self):
1046        self.pop('publisher')
1047        self._sync_author_detail('publisher')
1048    _end_webmaster = _end_dc_publisher
1049
1050    def _start_published(self, attrsD):
1051        self.push('published', 1)
1052    _start_dcterms_issued = _start_published
1053    _start_issued = _start_published
1054
1055    def _end_published(self):
1056        value = self.pop('published')
1057        self._save('published_parsed', _parse_date(value))
1058    _end_dcterms_issued = _end_published
1059    _end_issued = _end_published
1060
1061    def _start_updated(self, attrsD):
1062        self.push('updated', 1)
1063    _start_modified = _start_updated
1064    _start_dcterms_modified = _start_updated
1065    _start_pubdate = _start_updated
1066    _start_dc_date = _start_updated
1067
1068    def _end_updated(self):
1069        value = self.pop('updated')
1070        parsed_value = _parse_date(value)
1071        self._save('updated_parsed', parsed_value)
1072    _end_modified = _end_updated
1073    _end_dcterms_modified = _end_updated
1074    _end_pubdate = _end_updated
1075    _end_dc_date = _end_updated
1076
1077    def _start_created(self, attrsD):
1078        self.push('created', 1)
1079    _start_dcterms_created = _start_created
1080
1081    def _end_created(self):
1082        value = self.pop('created')
1083        self._save('created_parsed', _parse_date(value))
1084    _end_dcterms_created = _end_created
1085
1086    def _start_expirationdate(self, attrsD):
1087        self.push('expired', 1)
1088
1089    def _end_expirationdate(self):
1090        self._save('expired_parsed', _parse_date(self.pop('expired')))
1091
1092    def _start_cc_license(self, attrsD):
1093        self.push('license', 1)
1094        value = self._getAttribute(attrsD, 'rdf:resource')
1095        if value:
1096            self.elementstack[-1][2].append(value)
1097        self.pop('license')
1098       
1099    def _start_creativecommons_license(self, attrsD):
1100        self.push('license', 1)
1101
1102    def _end_creativecommons_license(self):
1103        self.pop('license')
1104
1105    def _addTag(self, term, scheme, label):
1106        context = self._getContext()
1107        tags = context.setdefault('tags', [])
1108        if (not term) and (not scheme) and (not label): return
1109        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
1110        if value not in tags:
1111            tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))
1112
1113    def _start_category(self, attrsD):
1114        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
1115        term = attrsD.get('term')
1116        scheme = attrsD.get('scheme', attrsD.get('domain'))
1117        label = attrsD.get('label')
1118        self._addTag(term, scheme, label)
1119        self.push('category', 1)
1120    _start_dc_subject = _start_category
1121    _start_keywords = _start_category
1122       
1123    def _end_itunes_keywords(self):
1124        for term in self.pop('itunes_keywords').split():
1125            self._addTag(term, 'http://www.itunes.com/', None)
1126       
1127    def _start_itunes_category(self, attrsD):
1128        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
1129        self.push('category', 1)
1130       
1131    def _end_category(self):
1132        value = self.pop('category')
1133        if not value: return
1134        context = self._getContext()
1135        tags = context['tags']
1136        if value and len(tags) and not tags[-1]['term']:
1137            tags[-1]['term'] = value
1138        else:
1139            self._addTag(value, None, None)
1140    _end_dc_subject = _end_category
1141    _end_keywords = _end_category
1142    _end_itunes_category = _end_category
1143
1144    def _start_cloud(self, attrsD):
1145        self._getContext()['cloud'] = FeedParserDict(attrsD)
1146       
1147    def _start_link(self, attrsD):
1148        attrsD.setdefault('rel', 'alternate')
1149        attrsD.setdefault('type', 'text/html')
1150        attrsD = self._itsAnHrefDamnIt(attrsD)
1151        if attrsD.has_key('href'):
1152            attrsD['href'] = self.resolveURI(attrsD['href'])
1153        expectingText = self.infeed or self.inentry or self.insource
1154        context = self._getContext()
1155        context.setdefault('links', [])
1156        context['links'].append(FeedParserDict(attrsD))
1157        if attrsD['rel'] == 'enclosure':
1158            self._start_enclosure(attrsD)
1159        if attrsD.has_key('href'):
1160            expectingText = 0
1161            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1162                context['link'] = attrsD['href']
1163        else:
1164            self.push('link', expectingText)
1165    _start_producturl = _start_link
1166
1167    def _end_link(self):
1168        value = self.pop('link')
1169        context = self._getContext()
1170        if self.intextinput:
1171            context['textinput']['link'] = value
1172        if self.inimage:
1173            context['image']['link'] = value
1174    _end_producturl = _end_link
1175
1176    def _start_guid(self, attrsD):
1177        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1178        self.push('id', 1)
1179
1180    def _end_guid(self):
1181        value = self.pop('id')
1182        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1183        if self.guidislink:
1184            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1185            # and only if the item doesn't already have a link element
1186            self._save('link', value)
1187
1188    def _start_title(self, attrsD):
1189        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1190    _start_dc_title = _start_title
1191    _start_media_title = _start_title
1192
1193    def _end_title(self):
1194        value = self.popContent('title')
1195        context = self._getContext()
1196        if self.intextinput:
1197            context['textinput']['title'] = value
1198        elif self.inimage:
1199            context['image']['title'] = value
1200    _end_dc_title = _end_title
1201    _end_media_title = _end_title
1202
1203    def _start_description(self, attrsD):
1204        context = self._getContext()
1205        if context.has_key('summary'):
1206            self._summaryKey = 'content'
1207            self._start_content(attrsD)
1208        else:
1209            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
1210
1211    def _start_abstract(self, attrsD):
1212        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1213
1214    def _end_description(self):
1215        if self._summaryKey == 'content':
1216            self._end_content()
1217        else:
1218            value = self.popContent('description')
1219            context = self._getContext()
1220            if self.intextinput:
1221                context['textinput']['description'] = value
1222            elif self.inimage:
1223                context['image']['description'] = value
1224        self._summaryKey = None
1225    _end_abstract = _end_description
1226
1227    def _start_info(self, attrsD):
1228        self.pushContent('info', attrsD, 'text/plain', 1)
1229    _start_feedburner_browserfriendly = _start_info
1230
1231    def _end_info(self):
1232        self.popContent('info')
1233    _end_feedburner_browserfriendly = _end_info
1234
1235    def _start_generator(self, attrsD):
1236        if attrsD:
1237            attrsD = self._itsAnHrefDamnIt(attrsD)
1238            if attrsD.has_key('href'):
1239                attrsD['href'] = self.resolveURI(attrsD['href'])
1240        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1241        self.push('generator', 1)
1242
1243    def _end_generator(self):
1244        value = self.pop('generator')
1245        context = self._getContext()
1246        if context.has_key('generator_detail'):
1247            context['generator_detail']['name'] = value
1248           
1249    def _start_admin_generatoragent(self, attrsD):
1250        self.push('generator', 1)
1251        value = self._getAttribute(attrsD, 'rdf:resource')
1252        if value:
1253            self.elementstack[-1][2].append(value)
1254        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')

    def _start_summary(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary

    def _start_enclosure(self, attrsD):
        attrsD = self._itsAnHrefDamnIt(attrsD)
        self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
        href = attrsD.get('href')
        if href:
            context = self._getContext()
            if not context.get('id'):
                context['id'] = href

    def _start_source(self, attrsD):
        self.insource = 1

    def _end_source(self):
        self.insource = 0
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()

    def _start_content(self, attrsD):
        self.pushContent('content', attrsD, 'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)

    def _start_prodlink(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)

    def _start_body(self, attrsD):
        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)
    _start_fullitem = _start_content_encoded

    def _end_content(self):
        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToDescription:
            self._save('description', value)
    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content
    _end_prodlink = _end_content

    def _start_itunes_image(self, attrsD):
        self.push('itunes_image', 0)
        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
    _start_itunes_link = _start_itunes_image

    def _end_itunes_block(self):
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

    def _end_itunes_explicit(self):
        value = self.pop('itunes_explicit', 0)
        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0

if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        def __init__(self, baseuri, baselang, encoding):
            if _debug: sys.stderr.write('trying StrictFeedParser\n')
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            self.bozo = 0
            self.exc = None

        def startPrefixMapping(self, prefix, uri):
            self.trackNamespace(prefix, uri)

        def startElementNS(self, name, qname, attrs):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if lowernamespace.find('backend.userland.com/rss') != -1:
                # match any backend.userland.com namespace
                namespace = 'http://backend.userland.com/rss'
                lowernamespace = namespace
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = None
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
                raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

            # qname implementation is horribly broken in Python 2.1 (it
            # doesn't report any), and slightly broken in Python 2.2 (it
            # doesn't report the xml: namespace). So we match up namespaces
            # with a known list first, and then possibly override them with
            # the qnames the SAX parser gives us (if indeed it gives us any
            # at all).  Thanks to MatejC for helping me test this and
            # tirelessly telling me that it didn't work yet.
            attrsD = {}
            for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
                lowernamespace = (namespace or '').lower()
                prefix = self._matchnamespaces.get(lowernamespace, '')
                if prefix:
                    attrlocalname = prefix + ':' + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            self.unknown_starttag(localname, attrsD.items())

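        # Illustrative sketch (not part of the parser API): for a feed that
        # declares xmlns:dc="http://purl.org/dc/elements/1.1/", an element
        # like <dc:creator> reaches unknown_starttag as 'dc:creator',
        # because _matchnamespaces maps the lowercased namespace URI back
        # to feedparser's canonical prefix.  Attribute names are normalized
        # the same way, e.g. (hypothetical values):
        #   attrsD = {'rdf:about': 'http://example.org/entry/1'}
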
        def characters(self, text):
            self.handle_data(text)

        def endElementNS(self, name, qname):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = ''
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            self.bozo = 1
            self.exc = exc

        def fatalError(self, exc):
            self.error(exc)
            raise exc

class _BaseHTMLProcessor(sgmllib.SGMLParser):
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

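    # Illustrative sketch of the shorttag rewrite above (assumed inputs):
    #   '<br/>'   -> '<br />'          (br takes no end tag)
    #   '<span/>' -> '<span></span>'   (span gets an explicit end tag)
    # Shorttags with attributes (e.g. '<img src="a.png"/>') contain
    # whitespace, so the regexp leaves them for sgmllib to handle.
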
    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])

class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data

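    # Illustrative sketch of decodeEntities (assumed contentparams):
    #   with contentparams['type'] == 'application/xhtml+xml':
    #     '&#60;b&#62;' -> '&lt;b&gt;'   (numeric refs become named entities)
    #   with contentparams['type'] == 'text/html':
    #     '&#60;b&#62;' -> '<b>'         (non-XML types are decoded fully)
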
class _RelativeURIResolver(_BaseHTMLProcessor):
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding)
    p.feed(htmlSource)
    return p.output()

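# Illustrative usage of _resolveRelativeURIs (hypothetical base URI):
#   >>> _resolveRelativeURIs('<a href="/about">x</a>',
#   ...                      'http://example.org/feed', 'utf-8')
#   '<a href="http://example.org/about">x</a>'
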
class _HTMLSanitizer(_BaseHTMLProcessor):
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

def _sanitizeHTML(htmlSource, encoding):
    p = _HTMLSanitizer(encoding)
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data

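# Illustrative sketch of _sanitizeHTML (assumed input, TIDY_MARKUP == 0):
#   >>> _sanitizeHTML('<p onclick="evil()">hi<script>alert(1)</script></p>', 'utf-8')
#   '<p>hi</p>'
# The onclick attribute is not in acceptable_attributes, and everything
# inside <script>...</script> is suppressed via unacceptablestack.
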
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            assert sys.version.split()[0] >= '2.3.3'
            assert base64 != None
            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except:
            return self.http_error_default(req, fp, code, msg, headers)
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.encodestring(user_passwd).strip()
        # try to open with urllib2 (to use optional headers)
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header('User-Agent', agent)
        if etag:
            request.add_header('If-None-Match', etag)
        if modified:
            # format into an RFC 1123-compliant timestamp. We can't use
            # time.strftime() since the %a and %b directives can be affected
            # by the current locale, but RFC 2616 states that dates must be
            # in English.
            short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
            months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
            request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
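            # Worked example (hedged; assumes a gmtime()-style 9-tuple):
            # modified = (2004, 1, 1, 19, 48, 21, 3, 1, 0) formats as
            # 'Thu, 01 Jan 2004 19:48:21 GMT' -- tm_wday 3 indexes 'Thu'
            # and tm_mon 1 indexes 'Jan' after the -1 adjustment.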
        if referrer:
            request.add_header('Referer', referrer)
        if gzip and zlib:
            request.add_header('Accept-encoding', 'gzip, deflate')
        elif gzip:
            request.add_header('Accept-encoding', 'gzip')
        elif zlib:
            request.add_header('Accept-encoding', 'deflate')
        else:
            request.add_header('Accept-encoding', '')
        if auth:
            request.add_header('Authorization', 'Basic %s' % auth)
        if ACCEPT_HEADER:
            request.add_header('Accept', ACCEPT_HEADER)
        request.add_header('A-IM', 'feed') # RFC 3229 support
        opener = urllib2.build_opener(*([_FeedURLHandler()] + handlers))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string)
    except:
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))

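# Illustrative usage of _open_resource (hypothetical URL; the arguments
# mirror those of parse() below):
#   f = _open_resource('http://example.org/feed.xml',
#                      etag=None, modified=None, agent=USER_AGENT,
#                      referrer=None, handlers=[])
#   data = f.read()
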
_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    _date_handlers.insert(0, func)

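# Illustrative sketch of a custom date handler (hypothetical format):
#   def _parse_date_epoch(dateString):
#       '''Parse a bare Unix timestamp like "1072986501"'''
#       if not dateString.isdigit(): return
#       return time.gmtime(int(dateString))
#   registerDateHandler(_parse_date_epoch)
# Handlers registered later are tried first, and must return None (or
# raise) for strings they do not recognize.
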
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance, is 030104 to be read as
# 2003-01-04 or as 0301-04-01?), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
del tmpl
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
del regex
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
             params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(params.get('second', 0))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    # daylight savings is complex, but not needed for feedparser's purposes
    # as time zones, if specified, include mention of whether it is active
    # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
    # most implementations have DST bugs
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)

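# Illustrative inputs accepted by the templates above (assumed):
#   '2003-12-31T10:14:55Z'  (YYYY-MM-DD with time and zone)
#   '20040105'              (YYYY-?MM-?DD with the dashes omitted)
#   '2003-335'              (YYYY-?OOO, an ordinal date: day 335 of 2003)
# The result is normalized through mktime()/localtime(), so missing
# fields default to the current GMT date.
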
# 8-bit date handling routines written by ytrewq1.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)

def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)

_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)

# Unicode strings for Greek date strings
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
  }

_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except:
        return
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)

# Unicode strings for Hungarian date strings
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1jus':    u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except:
        return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)

# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Drake and licensed under the Python license.  Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
def _parse_date_w3dtf(dateString):
    def __extract_date(m):
        year = int(m.group('year'))
        if year < 100:
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group('julian')
        if julian:
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group('month')
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group('day')
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        if not m:
            return 0, 0, 0
        hours = m.group('hours')
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        if not m:
            return 0
        tzd = m.group('tzd')
        if not tzd:
            return 0
        if tzd == 'Z':
            return 0
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        if tzd[0] == '+':
            return -offset
        return offset

    __date_re = (r'(?P<year>\d\d\d\d)'
                 r'(?:(?P<dsep>-|)'
                 r'(?:(?P<julian>\d\d\d)'
                 r'|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
    __tzd_re = r'(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    __time_re = (r'(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 r'(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
                 + __tzd_re)
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)

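# Illustrative sketch of _parse_date_w3dtf (assumed input; output shown
# for a system where mktime() and time.timezone behave conventionally):
#   >>> _parse_date_w3dtf('2003-12-31T10:14:55Z')[:6]
#   (2003, 12, 31, 10, 14, 55)
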
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    if len(data) < 5:
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)

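# Illustrative sketch of _parse_date_rfc822 (assumed input):
#   >>> _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')[:6]
#   (2004, 1, 1, 19, 48, 21)
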
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
            if not date9tuple: continue
            if len(date9tuple) != 9:
                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
                raise ValueError
            map(int, date9tuple)
            return date9tuple
        except Exception, e:
            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
            pass
    return None

def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither is specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-Type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')

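    # Illustrative sketch of _parseHTTPContentType (assumed inputs):
    #   'text/xml; charset=utf-8' -> ('text/xml', 'utf-8')
    #   ''                        -> ('', '')
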
    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile(r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type

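# Illustrative sketch of _getCharacterEncoding (assumed inputs):
#   >>> _getCharacterEncoding({'content-type': 'application/atom+xml'},
#   ...     '<?xml version="1.0" encoding="iso-8859-1"?><feed/>')
#   ('iso-8859-1', '', 'iso-8859-1', '', 1)
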
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present)
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    declmatch = re.compile(r'^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')

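# Illustrative sketch of _toUTF8 (assumed input):
#   >>> _toUTF8("<?xml version='1.0' encoding='iso-8859-1'?><a/>", 'iso-8859-1')
#   "<?xml version='1.0' encoding='utf-8'?><a/>"
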
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE
    '''
    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
    data = entity_pattern.sub('', data)
    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
    doctype_results = doctype_pattern.findall(data)
    doctype = doctype_results and doctype_results[0] or ''
    if doctype.lower().count('netscape'):
        version = 'rss091n'
    else:
        version = None
    data = doctype_pattern.sub('', data)
    return version, data

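# Illustrative sketch of _stripDoctype (assumed input):
#   >>> _stripDoctype('<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"><rss/>')
#   ('rss091n', '<rss/>')
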
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
    '''Parse a feed from a URL, file, stream, or string'''
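    # Illustrative usage (hypothetical URL):
    #   d = parse('http://example.org/atom.xml')
    #   d['feed'].get('title'), len(d['entries']), d.get('bozo')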
    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    if _XML_AVAILABLE:
        result['bozo'] = 0
    if type(handlers) == types.InstanceType:
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''
        f = None

    # if feed is gzip-compressed, decompress it
    if f and data and hasattr(f, 'headers'):
        if gzip and f.headers.get('content-encoding', '') == 'gzip':
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except Exception, e:
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage.  Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
            try:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''

    # save HTTP headers
    if hasattr(f, 'info'):
        info = f.info()
        result['etag'] = info.getheader('ETag')
        last_modified = info.getheader('Last-Modified')
        if last_modified:
            result['modified'] = _parse_date(last_modified)
    if hasattr(f, 'url'):
        result['href'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'headers'):
        result['headers'] = f.headers.dict
    if hasattr(f, 'close'):
        f.close()

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type'):
            bozo_message = '%s is not an XML media type' % http_headers['content-type']
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    result['version'], data = _stripDoctype(data)

    baseuri = http_headers.get('content-location', result.get('href'))
    baselang = http_headers.get('content-language', None)

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = ''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data.  This is a feature, not a bug!'
        return result

    # if there was a problem downloading, we're done
    if not data:
        return result

2538    # determine character encoding
2539    use_strict_parser = 0
2540    known_encoding = 0
2541    tried_encodings = []
2542    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
2543    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
2544        if not proposed_encoding: continue
2545        if proposed_encoding in tried_encodings: continue
2546        tried_encodings.append(proposed_encoding)
2547        try:
2548            data = _toUTF8(data, proposed_encoding)
2549            known_encoding = use_strict_parser = 1
2550            break
2551        except:
2552            pass
2553    # if no luck and we have auto-detection library, try that
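    # (chardet.detect returns a dict along the lines of
    # {'encoding': 'windows-1252', 'confidence': 0.84}; only the guessed
    # encoding name is used here)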
    if (not known_encoding) and chardet:
        try:
            proposed_encoding = chardet.detect(data)['encoding']
            if proposed_encoding and (proposed_encoding not in tried_encodings):
                tried_encodings.append(proposed_encoding)
                data = _toUTF8(data, proposed_encoding)
                known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and ('utf-8' not in tried_encodings):
        try:
            proposed_encoding = 'utf-8'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and ('windows-1252' not in tried_encodings):
        try:
            proposed_encoding = 'windows-1252'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, %s, utf-8, and windows-1252 but nothing worked' % \
            (result['encoding'], xml_encoding, sniffed_xml_encoding))
        result['encoding'] = ''
    elif proposed_encoding != result['encoding']:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
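        # data has already been converted to UTF-8 above, so the SAX parser
        # is handed bytes in a single known encoding regardless of what the
        # document originally declared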
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        try:
            saxparser.parse(source)
        except Exception, e:
            if _debug:
                import traceback
                traceback.print_stack()
                traceback.print_exc()
                sys.stderr.write('xml parsing failed\n')
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser:
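        # fall back to the sgmllib-based loose parser, which tolerates
        # ill-formed markup; claim utf-8 only if the re-encoding above
        # actually succeeded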
        feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
        feedparser.feed(data)
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result

if __name__ == '__main__':
    if not sys.argv[1:]:
        print __doc__
        sys.exit(0)
    else:
        urls = sys.argv[1:]
    zopeCompatibilityHack()
    from pprint import pprint
    for url in urls:
        print url
        print
        result = parse(url)
        pprint(result)
        print
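# Example command-line usage (the URL is hypothetical; any feed URL or
# local filename works):
#   python feedparser.py http://example.org/atom10.xml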

#REVISION HISTORY
#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
#  added Simon Fell's test suite
#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
#2.0 - 10/19/2002
#  JD - use inchannel to watch out for image and textinput elements which can
#  also contain title, link, and description elements
#  JD - check for isPermaLink='false' attribute on guid elements
#  JD - replaced openAnything with open_resource supporting ETag and
#  If-Modified-Since request headers
#  JD - parse now accepts etag, modified, agent, and referrer optional
#  arguments
#  JD - modified parse to return a dictionary instead of a tuple so that any
#  etag or modified information can be returned and cached by the caller
#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
#  because of etag/modified, return the old etag/modified to the caller to
#  indicate why nothing is being returned
#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
#  useless.  Fixes the problem JD was addressing by adding it.
#2.1 - 11/14/2002 - MAP - added gzip support
#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
#  start_admingeneratoragent is an example of how to handle elements with
#  only attributes, no content.
#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
#  also, make sure we send the User-Agent even if urllib2 isn't available.
#  Match any variation of backend.userland.com/rss namespace.
#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
#  snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
#  project name
#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
#  removed unnecessary urllib code -- urllib2 should always be available anyway;
#  return actual url, status, and full HTTP headers (as result['url'],
#  result['status'], and result['headers']) if parsing a remote feed over HTTP --
#  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
#  added the latest namespace-of-the-week for RSS 2.0
#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
#  User-Agent (otherwise urllib2 sends two, which confuses some servers)
#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
#  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
#  textInput, and also to return the character encoding (if specified)
#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
#  nested divs within content (JohnD); fixed missing sys import (JohanS);
#  fixed regular expression to capture XML character encoding (Andrei);
#  added support for Atom 0.3-style links; fixed bug with textInput tracking;
#  added support for cloud (MartijnP); added support for multiple
#  category/dc:subject (MartijnP); normalize content model: 'description' gets
#  description (which can come from description, summary, or full content if no
#  description), 'content' gets dict of base/language/type/value (which can come
#  from content:encoded, xhtml:body, content, or fullitem);
#  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
#  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
#  <content> element is not in default namespace (like Pocketsoap feed);
#  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
#  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
#  description, xhtml:body, content, content:encoded, title, subtitle,
#  summary, info, tagline, and copyright; added support for pingback and
#  trackback namespaces
#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
#  namespaces, as opposed to 2.6 when I said I did but didn't really;
#  sanitize HTML markup within some elements; added mxTidy support (if
#  installed) to tidy HTML markup within some elements; fixed indentation
#  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
#  (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
#  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
#  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
#  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
#2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
#  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
#  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
#  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
#  fixed relative URI processing for guid (skadz); added ICBM support; added
#  base64 support
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
#  blogspot.com sites); added _debug variable
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
#  added several new supported namespaces; fixed bug tracking naked markup in
#  description; added support for enclosure; added support for source; re-added
#  support for cloud which got dropped somehow; added support for expirationDate
#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
#  xml:base URI, one for documents that don't define one explicitly and one for
#  documents that define an outer and an inner xml:base that goes out of scope
#  before the end of the document
#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
#  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
#  added support for creativeCommons:license and cc:license; added support for
#  full Atom content model in title, tagline, info, copyright, summary; fixed bug
#  with gzip encoding (not always telling server we support it when we do)
#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
#  (dictionary of 'name', 'url', 'email'); map author to author_detail if author
#  contains name + email address
#3.0b8 - 1/28/2004 - MAP - added support for contributor
#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
#  support for summary
#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
#  xml.util.iso8601
#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
#  dangerous markup; fiddled with decodeEntities (not right); liberalized
#  date parsing even further
#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
#  added support for Atom 0.2 subtitle; added support for Atom content model
#  in copyright; better sanitizing of dangerous HTML elements with end tags
#  (script, frameset)
#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
#  Python 2.1
#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
#  fixed bug capturing author and contributor URL; fixed bug resolving relative
#  links in author and contributor URL; fixed bug resolving relative links in
#  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
#  namespace tests, and included them permanently in the test suite with his
#  permission; fixed namespace handling under Python 2.1
#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
#  use libxml2 (if available)
#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
#  name was in parentheses; removed ultra-problematic mxTidy support; patch to
#  workaround crash in PyXML/expat when encountering invalid entities
#  (MarkMoraes); support for textinput/textInput
#3.0b20 - 4/7/2004 - MAP - added CDF support
#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
#  results dict; changed results dict to allow getting values with results.key
#  as well as results[key]; work around embedded illformed HTML with half
#  a DOCTYPE; work around malformed Content-Type header; if character encoding
#  is wrong, try several common ones before falling back to regexes (if this
#  works, bozo_exception is set to CharacterEncodingOverride); fixed character
#  encoding issues in BaseHTMLProcessor by tracking encoding and converting
#  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
#  convert each value in results to Unicode (if possible), even if using
#  regex-based parsing
#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
#  high-bit characters in attributes in embedded HTML in description (thanks
#  Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
#  FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
#  about a mapped key
#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
#  results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
#  cause the same encoding to be tried twice (even if it failed the first time);
#  fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
#  better textinput and image tracking in illformed RSS 1.0 feeds
#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
#  my blink tag tests
#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
#  failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
#  duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
#  added support for image; refactored parse() fallback logic to try other
#  encodings if SAX parsing fails (previously it would only try other encodings
#  if re-encoding failed); remove unichr madness in normalize_attrs now that
#  we're properly tracking encoding in and out of BaseHTMLProcessor; set
#  feed.language from root-level xml:lang; set entry.id from rdf:about;
#  send Accept header
#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
#  iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
#  windows-1252); fixed regression that could cause the same encoding to be
#  tried twice (even if it failed the first time)
#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
#  recover from malformed content-type header parameter with no equals sign
#  ('text/xml; charset:iso-8859-1')
#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
#  to Unicode equivalents in illformed feeds (aaronsw); added and
#  passed tests for converting character entities to Unicode equivalents
#  in illformed feeds (aaronsw); test for valid parsers when setting
#  XML_AVAILABLE; make version and encoding available when server returns
#  a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
#  digest auth or proxy support); add code to parse username/password
#  out of url and send as basic authentication; expose downloading-related
#  exceptions in bozo_exception (aaronsw); added __contains__ method to
#  FeedParserDict (aaronsw); added publisher_detail (aaronsw)
#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
#  convert feed to UTF-8 before passing to XML parser; completely revamped
#  logic for determining character encoding and attempting XML parsing
#  (much faster); increased default timeout to 20 seconds; test for presence
#  of Location header on redirects; added tests for many alternate character
#  encodings; support various EBCDIC encodings; support UTF-16BE and
#  UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
#  UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
#  XML parsers are available; added support for 'Content-encoding: deflate';
#  send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
#  are available
#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
#  problem tracking xml:base and xml:lang if element declares it, child
#  doesn't, first grandchild redeclares it, and second grandchild doesn't;
#  refactored date parsing; defined public registerDateHandler so callers
#  can add support for additional date formats at runtime; added support
#  for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
#  zopeCompatibilityHack() which turns FeedParserDict into a regular
#  dictionary, required for Zope compatibility, and also makes command-
#  line debugging easier because pprint module formats real dictionaries
#  better than dictionary-like objects; added NonXMLContentType exception,
#  which is stored in bozo_exception when a feed is served with a non-XML
#  media type such as 'text/plain'; respect Content-Language as default
#  language if no xml:lang is present; cloud dict is now FeedParserDict;
#  generator dict is now FeedParserDict; better tracking of xml:lang,
#  including support for xml:lang='' to unset the current language;
#  recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
#  namespace; don't overwrite final status on redirects (scenarios:
#  redirecting to a URL that returns 304, redirecting to a URL that
#  redirects to another URL with a different type of redirect); add
#  support for HTTP 303 redirects
#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
#  encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
#  support for Atom 1.0; support for iTunes extensions; new 'tags' for
#  categories/keywords/etc. as array of dict
#  {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
#  terminology; parse RFC 822-style dates with no time; lots of other
#  bug fixes
#4.1 - MAP - removed socket timeout; added support for chardet library