Question
I'm using HTMLParser to parse pages I pull down with urllib, and I am getting UnicodeDecodeError exceptions when passing some of them to HTMLParser. The information is there if I just print the page out.
# Script that downloads YouTube search result pages with urllib and feeds
# them to an HTMLParser subclass.
- from HTMLParser import HTMLParser
- import urllib
- import chardet
# HTMLParser subclass; all of the fetching and parsing happens in __init__.
- class search_youtube(HTMLParser):
# search_terms: iterable of search strings, processed one after another.
- def __init__(self, search_terms):
- HTMLParser.__init__(self)
# Starts empty; nothing in the code shown here appends to it.
- self.track_ids = []
- for search in search_terms:
# Reset for every search term; presumably read by tag handlers that are
# not part of this listing - TODO confirm.
- self.__in_result = False
# URL-quote the term before appending it to the query string.
- search = urllib.quote_plus(search)
- query = 'http://youtube.com/results?search_query='
# The page is kept exactly as read() returns it; no decoding happens here.
- page = urllib.urlopen(query + search).read()
# First attempt: feed the page as read.
- try:
- self.feed(page)
- except UnicodeDecodeError:
# Fallback: let chardet guess the encoding, decode with it, then re-encode
# to ASCII while dropping every character that is not ASCII.
- encoding = chardet.detect(page)['encoding']
# NOTE(review): presumably meant to skip the conversion when the data is
# already unicode - confirm that chardet can actually report 'unicode'.
- if encoding != 'unicode':
- page = page.decode(encoding)
- page = page.encode('ascii', 'ignore')
# NOTE(review): this retry has no handler of its own, so a
# UnicodeDecodeError raised here propagates to the caller. HTMLParser may
# also still hold the data from the failed first feed() - verify.
- self.feed(page)
- print 'success'
# Driver: run one search and print whatever ended up in track_ids.
- searches = ['telepopmusik breathe']
- results = search_youtube(searches)
- print results.track_ids
- Traceback (most recent call last):
- File "test.py", line 27, in <module>
- results = search_youtube(searches)
- File "test.py", line 23, in __init__
- self.feed(page)
- File "/usr/lib/python2.6/HTMLParser.py", line 108, in feed
- self.goahead(0)
- File "/usr/lib/python2.6/HTMLParser.py", line 148, in goahead
- k = self.parse_starttag(i)
- File "/usr/lib/python2.6/HTMLParser.py", line 252, in parse_starttag
- attrvalue = self.unescape(attrvalue)
- File "/usr/lib/python2.6/HTMLParser.py", line 390, in unescape
- return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
- File "/usr/lib/python2.6/re.py", line 151, in sub
- return _compile(pattern, 0).sub(repl, string, count)
- UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 1: ordinal not in range(128)
It is UTF-8, indeed. This works:
from HTMLParser import HTMLParser
import urllib


class search_youtube(HTMLParser):
    """Fetch YouTube search result pages and run them through HTMLParser.

    Every page is decoded to ``unicode`` before it reaches the parser, so
    the parser never has to mix undecoded non-ASCII bytes with unicode
    text while it unescapes attribute values.
    """

    # Used when the response declares no charset, or one that cannot be used.
    _FALLBACK_ENCODING = 'utf-8'

    def __init__(self, search_terms):
        """Download and parse one results page per entry in search_terms.

        search_terms -- iterable of plain search strings; each one is
                        URL-quoted before it is appended to the query URL.

        Errors raised by urllib.urlopen() or by the parser itself
        propagate to the caller.
        """
        HTMLParser.__init__(self)
        self.track_ids = []
        query = 'http://youtube.com/results?search_query='
        for search in search_terms:
            self.__in_result = False
            connection = urllib.urlopen(query + urllib.quote_plus(search))
            try:
                # The charset parameter of the Content-Type header, or None.
                encoding = connection.headers.getparam('charset')
                raw_page = connection.read()
            finally:
                # Release the socket even if reading the response fails.
                connection.close()
            self.feed(self._decode(raw_page, encoding))
            print('success')

    def _decode(self, raw_page, encoding):
        """Return raw_page as unicode, preferring the declared encoding.

        raw_page -- response body as a byte string.
        encoding -- charset named by the server, or None if it named none.

        If the declared charset is missing, unknown to Python, or does not
        match the data, the page is decoded with the fallback encoding and
        undecodable bytes are replaced instead of raising.
        """
        try:
            return raw_page.decode(encoding or self._FALLBACK_ENCODING)
        except (LookupError, UnicodeDecodeError):
            return raw_page.decode(self._FALLBACK_ENCODING, 'replace')


searches = ['telepopmusik breathe']
results = search_youtube(searches)
print(results.track_ids)
沒有留言:
張貼留言