from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

fn = 'test.pdf'
with open(fn, mode='rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    item = {}
    for i in fields:
        field = resolve1(i)
        name, value = field.get('T'), field.get('V')
        item[name] = value
Hello, I need help with this code, as it is giving me a Unicode error on some characters:
Traceback (most recent call last):
File "<stdin>", line 7, in <module>
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdftypes.py", line 80, in resolve1
x = x.resolve(default=default)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdftypes.py", line 67, in resolve
return self.doc.getobj(self.objid)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 673, in getobj
stream = stream_value(self.getobj(strmid))
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 676, in getobj
obj = self._getobj_parse(index, objid)
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/pdfdocument.py", line 648, in _getobj_parse
raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
File "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/psparser.py", line 85, in __repr__
return self.name.decode('ascii')
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)
Is there anything I can add so it "ignores" the characters it's not able to decode, or at least returns the name with the value blank in name, value = field.get('T'), field.get('V')?
Any help is appreciated.
Here is one way you can fix it:
nano "/home/timmy/.local/lib/python3.8/site-packages/pdfminer/psparser.py"
then, at line 85, change __repr__ to:
def __repr__(self):
    return self.name.decode('ascii', 'ignore')  # this fixes it
I don't believe it's recommended to edit installed library source files, though; you should also post an issue on GitHub.
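If you'd rather not touch the installed package at all, here is a minimal monkey-patch sketch of the same idea. It assumes the failing __repr__ at psparser.py line 85 belongs to PSKeyword; if your traceback points at PSLiteral instead, patch that class.

# Non-invasive alternative to editing site-packages: patch the repr at
# runtime before parsing. Assumption: the __repr__ in question is
# PSKeyword's (swap in PSLiteral if your traceback says otherwise).
from pdfminer import psparser

def _lenient_repr(self):
    return self.name.decode('ascii', 'ignore')

psparser.PSKeyword.__repr__ = _lenient_repr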
I've tried to convert a shapefile into GeoJSON format using Python, and this is my code:
import shapefile
import json
path = "shapefiles/WTL_VALV_PS"
sf = shapefile.Reader(path)
fields = sf.fields[1:]
field_names = [field[0] for field in fields]
buffer = []
for sr in sf.shapeRecords():
    atr = dict(zip(field_names, sr.record))
    geom = sr.shape.__geo_interface__
    buffer.append(dict(type="Feature", geometry=geom, properties=atr))
geojson = open("test4.geojson", "w", encoding='utf-8')
geojson.write(json.dumps({"type": "FeatureCollection", "features": buffer}, indent=2, ensure_ascii=False))
geojson.close()
but I got this error
Traceback (most recent call last):
File "C:/Users/user/PycharmProjects/ConvertGeoJSON/geoJSON.py", line 16, in <module>
for sr in sf.shapeRecords():
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 1039, in shapeRecords
for rec in zip(self.shapes(), self.records())])
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 1012, in records
r = self.__record(oid=i)
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 987, in __record
value = u(value, self.encoding, self.encodingErrors)
File "C:\Users\user\PycharmProjects\ConvertGeoJSON\venv\lib\site-packages\shapefile.py", line 104, in u
return v.decode(encoding, encodingErrors)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc1 in position 1: invalid start byte
I thought it was because the shapefile contains Korean characters, but I could convert a file with Arabic characters like 'أفغانستان', and of course I could convert a file with English.
I've lost my way and I don't know where I'm supposed to start.
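A possible direction, offered as a sketch rather than a confirmed fix: the traceback shows pyshp decoding record values with self.encoding and self.encodingErrors, so the reader accepts those as keyword arguments. If the DBF attribute table is in a Korean codepage rather than UTF-8, passing it explicitly may help ('cp949' is an assumption here).

import shapefile

path = "shapefiles/WTL_VALV_PS"

# Assumption: the attribute table is encoded as cp949 (common for Korean data).
sf = shapefile.Reader(path, encoding='cp949')

# Or, to keep going past undecodable bytes instead of failing:
# sf = shapefile.Reader(path, encoding='utf-8', encodingErrors='replace')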
I'm trying to read a YAML file and convert it into a dictionary. I'm seeing an issue while loading the file into the dict variable.
I tried to search for similar issues. One of the replies on Stack Overflow was to replace each '\\xa0' character with ' ', so I tried line = line.replace('\\xa0',' '). This program doesn't work on Python 2.7; when I tried Python 3, it works fine.
import yaml
import sys

yaml_dir = "/root/tools/test_case/"
#file_name = "TC_CFD_SR.yml"
file_name = "TC_QB.yml"
tc_file_name = yaml_dir + file_name

def write(file, content):
    file = open(file, 'a')
    file.write(content)
    file.close()

def verifyYmlFile(yml_file):
    data = {}
    with open(yml_file, 'r') as fin:
        for line in fin:
            line = line.replace('\\xa0',' ')
            write('anand-yaml.yml', line)
    with open('anand-yaml.yml','r') as fin:
        data = yaml.load(fin)
    return data

if __name__ == '__main__':
    data = {}
    print "verifying yaml"
    data = verifyYmlFile(tc_file_name)
Error:
[root@anand-harness test_case]# python verify_yaml.py
verifying yaml
Traceback (most recent call last):
File "verify_yaml.py", line 29, in <module>
data= verifyYmlFile(tc_file_name)
File "verify_yaml.py", line 23, in verifyYmlFile
data = yaml.load(fin)
File "/usr/lib64/python2.6/site-packages/yaml/__init__.py", line 71, in load
return loader.get_single_data()
File "/usr/lib64/python2.6/site-packages/yaml/constructor.py", line 37, in get_single_data
node = self.get_single_node()
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 36, in get_single_node
document = self.compose_document()
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 55, in compose_document
node = self.compose_node(None, None)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 82, in compose_node
node = self.compose_sequence_node(anchor)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 111, in compose_sequence_node
node.value.append(self.compose_node(node, index))
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 84, in compose_node
node = self.compose_mapping_node(anchor)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 133, in compose_mapping_node
item_value = self.compose_node(node, item_key)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 64, in compose_node
if self.check_event(AliasEvent):
File "/usr/lib64/python2.6/site-packages/yaml/parser.py", line 98, in check_event
self.current_event = self.state()
File "/usr/lib64/python2.6/site-packages/yaml/parser.py", line 449, in parse_block_mapping_value
if not self.check_token(KeyToken, ValueToken, BlockEndToken):
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 116, in check_token
self.fetch_more_tokens()
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 244, in fetch_more_tokens
return self.fetch_single()
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 653, in fetch_single
self.fetch_flow_scalar(style='\'')
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 667, in fetch_flow_scalar
self.tokens.append(self.scan_flow_scalar(style))
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 1156, in scan_flow_scalar
chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 1196, in scan_flow_scalar_non_spaces
while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
File "/usr/lib64/python2.6/site-packages/yaml/reader.py", line 91, in peek
self.update(index+1)
File "/usr/lib64/python2.6/site-packages/yaml/reader.py", line 165, in update
exc.encoding, exc.reason)
yaml.reader.ReaderError: 'utf8' codec can't decode byte #xa0: invalid start byte
in "anand-yaml.yml", position 3246
What am I missing?
The character sequence "\\xa0" is not the problem you see in the message; the problem is the sequence "\xa0" (note that the backslash is not escaped).
Your replacement line should be:
line = line.replace('\xa0',' ')
to circumvent the problem.
If you know what the encoding is, you can do the correct conversion yourself, but that should not be necessary, and neither that nor the patching above is a structural solution. It would be best if the YAML file were generated correctly in the first place (YAML files default to UTF-8, so the file should contain valid UTF-8). It could also be UTF-16 without the appropriate BOM (which the yaml library interprets, IIRC).
# Python 2 demonstration of the difference:
s1 = 'abc\\xa0xyz'       # escaped backslash: plain ASCII containing '\', 'x', 'a', '0'
print(repr(s1))
u1 = s1.decode('utf-8')  # this works fine
s = 'abc\xa0xyz'         # contains the single raw byte 0xa0
print(repr(s))
u = s.decode('utf-8')    # this throws UnicodeDecodeError
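If the file's real encoding is known, a Python 2 sketch (assuming latin-1 purely for illustration) is to decode the bytes explicitly and hand PyYAML unicode:

# Python 2 sketch -- 'latin-1' is an assumption; substitute whatever
# encoding the YAML file was actually written in (e.g. 'utf-16').
import io
import yaml

with io.open('anand-yaml.yml', 'r', encoding='latin-1') as fin:
    data = yaml.load(fin.read())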
I am trying to use HTMLParser and urllib2 to get to an image file
content = urllib2.urlopen( imgurl.encode('utf-8') ).read()
try:
    p = MyHTMLParser( )
    p.feed( content )
    p.download_file( )
    p.close()
except Exception,e:
    print e
MyHTMLParser:
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.url = ""
        self.outfile = "some.png"

    def download_file(self):
        urllib.urlretrieve( self.url, self.outfile )

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            # after some manipulation here, self.url will have an img url
            self.url = "http://somewhere.com/Fondue%C3%A0.png"
When I run the script, I get:
Traceback (most recent call last):
File "test.py", line 59, in <module>
p.feed( data )
File "/usr/lib/python2.7/HTMLParser.py", line 114, in feed
self.goahead(0)
File "/usr/lib/python2.7/HTMLParser.py", line 158, in goahead
k = self.parse_starttag(i)
File "/usr/lib/python2.7/HTMLParser.py", line 305, in parse_starttag
attrvalue = self.unescape(attrvalue)
File "/usr/lib/python2.7/HTMLParser.py", line 472, in unescape
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
File "/usr/lib/python2.7/re.py", line 151, in sub
return _compile(pattern, flags).sub(repl, string, count)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 56: ordinal not in range(128)
Using the suggestions I found, I tried the .encode('utf-8') method, but it still gives me an error. How do I fix this? Thanks.
Replace
content = urllib2.urlopen( url.encode('utf-8') ).read()
with
content = urllib2.urlopen(url).read().decode('utf-8')
to decode the response into unicode. The error happens because feed() receives a byte string containing raw UTF-8 bytes (0xc3 in the traceback); when HTMLParser's entity handling mixes those bytes with unicode, Python 2 falls back to an implicit ASCII decode and fails. Feeding decoded unicode avoids that.
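For completeness, a minimal Python 2 sketch of the corrected flow; the page URL is a hypothetical stand-in, and MyHTMLParser is the class from the question:

import urllib2

page_url = "http://somewhere.com/gallery.html"  # hypothetical page containing the <a> tag
content = urllib2.urlopen(page_url).read().decode('utf-8')  # unicode, not bytes

p = MyHTMLParser()
p.feed(content)       # feeding unicode avoids the implicit ascii decode
p.download_file()     # self.url is percent-encoded, so it is ASCII-safe for urlretrieve
p.close()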
I am trying to replace the response body with a single search result block from a Google search result page using response.replace(), and I am running into encoding issues.
scrapy shell "http://www.google.de/search?q=Zuckerccc"
>>> srb = hxs.select("//li[@class='g']").extract()
>>> body = '<html><body>' + srb[0] + '</body></html>' # get only 1st search result block
>>> b = response.replace(body = body)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/text.py", line 54, in replace
return Response.replace(self, *args, **kwargs)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/__init__.py", line 77, in replace
return cls(*args, **kwargs)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/text.py", line 31, in __init__
super(TextResponse, self).__init__(*args, **kwargs)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/__init__.py", line 19, in __init__
self._set_body(body)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/text.py", line 48, in _set_body
self._body = body.encode(self._encoding)
File "../local_1/Linux-2.6c2.5-x86_64/Python/Python-147.0-0/lib/python2.6/encodings/cp1252.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
UnicodeEncodeError: 'charmap' codec can't encode character u'\u0131' in position 529: character maps to <undefined>
I tried to create my own response as well,
>>> x = HtmlResponse("http://www.google.de/search?q=Zuckerccc", body = body, encoding = response.encoding)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/text.py", line 31, in __init__
super(TextResponse, self).__init__(*args, **kwargs)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/__init__.py", line 19, in __init__
self._set_body(body)
File "scrapy/lib/python2.6/site-packages/scrapy/http/response/text.py", line 48, in _set_body
self._body = body.encode(self._encoding)
File "../local_1/Linux-2.6c2.5-x86_64/Python/Python-147.0-0/lib/python2.6/encodings/cp1252.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
UnicodeEncodeError: 'charmap' codec can't encode character u'\u0131' in position 529: character maps to <undefined>
Also, when I use _body_declared_encoding() as the encoding in the replace() call, it works:
replace(body = body, encoding = response._body_declared_encoding())
I don't understand why response._body_declared_encoding() and response.encoding are different. Can anybody please shed some light on this?
So, what would be a good way to fix this?
I successfully replaced the response body with these lines of code:
scrapy shell "http://www.google.de/search?q=Zuckerccc"
>>> google_result = response.xpath('//li[@class="g"]').extract()[0]
>>> body = '<html><body>' + google_result + '</body></html>'
>>> b = response.replace(body = body)
I checked the source code of scrapy.http.response.text: when we use TextResponse, we need to set self._encoding first. So we can do it like this:
>>> response._encoding = 'utf8'
>>> response._set_body("aaaaaa")
>>> response.body
'aaaaaa'
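Alternatively, a minimal sketch that avoids poking private attributes: replace() forwards an encoding keyword to the new response (the question itself uses this), so passing an encoding that can represent u'\u0131' also works; 'utf-8' is assumed here.

# Sketch: re-encode the new body as UTF-8 instead of the original cp1252.
google_result = response.xpath('//li[@class="g"]').extract()[0]
body = '<html><body>' + google_result + '</body></html>'
b = response.replace(body=body, encoding='utf-8')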
I have this code
# -*- coding: utf8 -*-
__author__ = 'user'
import gdata.youtube.service
yt_service = gdata.youtube.service.YouTubeService()
query = gdata.youtube.service.YouTubeVideoQuery()
query.vq = u"не"
feed = yt_service.YouTubeQuery(query)
for yt_item in feed.entry:
    print yt_item.GetSwfUrl()
And I am getting this error:
Traceback (most recent call last):
File "cyr_search.py", line 7, in
feed = yt_service.YouTubeQuery(query)
File "/Users/user/Documents/GrabaHeroku/graba_h_ve/lib/python2.7/site-packages/gdata/youtube/service.py", line 1346, in YouTubeQuery
result = self.Query(query.ToUri())
File "/Users/user/Documents/GrabaHeroku/graba_h_ve/lib/python2.7/site-packages/gdata/service.py", line 1715, in ToUri
return atom.service.BuildUri(q_feed, self)
File "/Users/user/Documents/GrabaHeroku/graba_h_ve/lib/python2.7/site-packages/atom/service.py", line 584, in BuildUri
parameter_list = DictionaryToParamList(url_params, escape_params)
File "/Users/user/Documents/GrabaHeroku/graba_h_ve/lib/python2.7/site-packages/atom/service.py", line 551, in DictionaryToParamList
for param, value in (url_parameters or {}).items()]
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 1275, in quote_plus
return quote(s, safe)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 1268, in quote
return ''.join(map(quoter, s))
KeyError: u'\u043d'
How do I search for non-ASCII text? Do I need to URL-encode the query? I thought the library would do that on its own.
Change to:
query.vq = u"не".encode('utf8')
The string needs to be encoded before being sent.
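To illustrate why (urllib.quote_plus is where the traceback ends), a small Python 2 sketch:

import urllib

# quote_plus handles UTF-8 bytes, but raises KeyError on non-ASCII
# unicode code points -- exactly the u'\u043d' from the traceback.
print urllib.quote_plus(u"не".encode('utf8'))   # -> '%D0%BD%D0%B5'
# urllib.quote_plus(u"не")                      # KeyError: u'\u043d'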