Decoding several text files from Byte to UTF-8 - python

I'm currently trying to loop over roughly 9000 .txt files in python to extract data and add them to a joined pandas data frame. The .txt data is stored in bytes, so in order to access it I was told to use a decoder. Because I'm interested in preserving special characters, I would like to use the UTF-8 decoder, but I'm getting the following error when trying to do so:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte
For some reason, the code works just fine when using a 'ISO-8859-1' decoder, but this obviously messes up all special characters. Does anyone know how to fix this? I'm pasting my code below! Also, the decoding works for the first ~1600 .txt files in my dataset, but for the rest it doesn't.
decode_counter = 0
for index, i in enumerate(corpus[0]):
decode_counter += 1
corpus.iloc[index][0] = i.decode('UTF-8')
The corpus variable contains the name of the .txt file as an index, and the contents of the individual .txt files in a column named 0. Thank you very much!

Maybe you could try every codec available in your environment and check which result fits best.
Here is a way of doing that:
import os, codecs, encodings
from collections import OrderedDict
from typing import Union
from cprinter import TC
from input_timeout import InputTimeout
class CodecChecker:
def __init__(self):
self.encodingdict = self.get_codecs()
self.results = OrderedDict()
def get_codecs(self):
dir = encodings.__path__[0]
codec_names = OrderedDict()
for filename in os.listdir(dir):
if not filename.endswith(".py"):
continue
name = filename[:-3]
try:
codec_names[name] = OrderedDict({"object": codecs.lookup(name)})
except Exception as Fehler:
pass
return codec_names
def try_open_file(self, path: str, readlines: int = 0):
self.results = OrderedDict()
results = OrderedDict()
if readlines == 0:
for key, item in self.encodingdict.items():
results[key] = {"strict_encoded": [], "strict_bad": True}
try:
with open(path, encoding=key) as f:
data = f.read()
results[key]["strict_encoded"].append(data)
results[key]["strict_bad"] = False
except Exception as fe:
results[key]["strict_encoded"].append(str(fe))
continue
else:
for key, item in self.encodingdict.items():
results[key] = {"strict_encoded": [], "strict_bad": True}
try:
with open(path, encoding=key) as f:
for ini, line in enumerate(f.readlines()):
if ini == readlines:
break
results[key]["strict_encoded"].append(line[:-1])
results[key]["strict_bad"] = False
except Exception as fe:
results[key]["strict_encoded"].append(str(fe))
continue
self.results = results.copy()
return self
def try_convert_bytes(self, variable: bytes):
self.results = OrderedDict()
results = OrderedDict()
modes = ["strict", "ignore", "replace"]
for key, item in self.encodingdict.items():
results[key] = {
"strict_encoded": [],
"strict_bad": True,
"ignore_encoded": [],
"ignore_bad": True,
"replace_encoded": [],
"replace_bad": True,
}
for mo in modes:
try:
results[key][f"{mo}_encoded"].append(
item["object"].decode(variable, mo)
)
results[key][f"{mo}_bad"] = False
except Exception as Fe:
results[key][f"{mo}_encoded"].append(str(Fe))
self.results = results.copy()
return self
def print_results(
self, pause_after_interval: Union[int, float] = 0, items_per_interval: int = 0
):
counter = 0
for key, item in self.results.items():
if pause_after_interval != 0 and items_per_interval != 0:
if items_per_interval == counter and counter > 0:
i = InputTimeout(
timeout=pause_after_interval,
input_message=f"Press any key to continue or wait {pause_after_interval} seconds",
timeout_message="",
defaultvalue="",
cancelbutton=None,
show_special_characters_warning=None,
).finalvalue
counter = 0
print(
f'\n\n\n{"Codec".ljust(20)}: {str(TC(key).bg_cyan.fg_black)}'.ljust(100)
)
if "strict_bad" in item and "strict_encoded" in item:
print(f'{"Mode".ljust(20)}: {TC("strict").fg_yellow.bg_black}')
if item["strict_bad"] is False:
if isinstance(item["strict_encoded"][0], tuple):
if item["strict_bad"] is False:
try:
print(
f"""{'Length'.ljust(20)}: {TC(f'''{item['strict_encoded'][0][1]}''').fg_purple.bg_black}\n{'Converted'.ljust(20)}: {TC(f'''{item['strict_encoded'][0][0]}''').fg_green.bg_black}"""
)
except Exception:
print(
f"""Problems during printing! Raw string: {item['strict_encoded'][0][0]!r}"""
)
if item["strict_bad"] is True:
try:
print(
f"""{'Length'.ljust(20)}: {TC(f'''{"None"}''').fg_red.bg_black}\n{'Converted'.ljust(20)}: {TC(f'''{item['strict_encoded'][0]}''').fg_red.bg_black}"""
)
except Exception:
print(
f"""Problems during printing! Raw string: {item['strict_encoded'][0][0]!r}"""
)
if isinstance(item["strict_encoded"][0], str):
if item["strict_bad"] is False:
itemlen = len("".join(item["strict_encoded"]))
concatitem = "\n" + "\n".join(
[
f"""Line: {str(y).ljust(14)} {str(f'''{x}''')}"""
for y, x in enumerate(item["strict_encoded"])
]
)
try:
print(
f"""{'Length'.ljust(20)}: {TC(f'''{itemlen}''').fg_purple.bg_black}\n{'Converted'.ljust(20)}: {concatitem}"""
)
except Exception:
print(
f"""Problems during printing! Raw string: {concatitem!r}"""
)
if item["strict_bad"] is True:
concatitem = TC(
" ".join(item["strict_encoded"])
).fg_red.bg_black
try:
print(
f"""{'Length'.ljust(20)}: {TC(f'''{"None"}''').fg_red.bg_black}\n{'Converted'.ljust(20)}: {concatitem}"""
)
except Exception:
print(
f"""Problems during printing! Raw string: {concatitem!r}"""
)
print("")
if "ignore_bad" in item and "ignore_encoded" in item:
print(f'{"Mode".ljust(20)}: {TC("ignore").fg_yellow.bg_black}')
if item["ignore_bad"] is False:
if isinstance(item["ignore_encoded"][0], tuple):
if item["ignore_bad"] is False:
try:
print(
f"""{'Length'.ljust(20)}: {TC(f'''{item['ignore_encoded'][0][1]}''').bg_black.fg_lightgrey}\n{'Converted'.ljust(20)}: {TC(f'''{item['ignore_encoded'][0][0]}''').bg_black.fg_lightgrey}"""
)
except Exception:
print(
f"""Problems during printing! Raw string: {item['ignore_encoded'][0][0]!r}"""
)
print("")
if "replace_bad" in item and "replace_encoded" in item:
print(f'{"Mode".ljust(20)}: {TC("replace").fg_yellow.bg_black}')
if item["replace_bad"] is False:
if isinstance(item["replace_encoded"][0], tuple):
if item["replace_bad"] is False:
try:
print(
f"""{'Length'.ljust(20)}: {TC(f'''{item['replace_encoded'][0][1]}''').bg_black.fg_lightgrey}\n{'Converted'.ljust(20)}: {TC(f'''{item['replace_encoded'][0][0]}''').bg_black.fg_lightgrey}"""
)
except Exception:
print(
f"""Problems during printing! Raw string: {item['replace_encoded'][0][0]!r}"""
)
counter = counter + 1
return self
if __name__ == "__main__":
teststuff = b"""This is a test!
Hi there!
A little test! """
testfilename = "test_utf8.tmp"
with open("test_utf8.tmp", mode="w", encoding="utf-8-sig") as f:
f.write(teststuff.decode("utf-8-sig"))
codechecker = CodecChecker()
codechecker.try_open_file(testfilename, readlines=2).print_results(
pause_after_interval=1, items_per_interval=10
)
codechecker.try_open_file(testfilename).print_results()
codechecker.try_convert_bytes(teststuff.decode("cp850").encode()).print_results(
pause_after_interval=1, items_per_interval=10
)
Or you simply run a script to replace all messed up characters. Since I am a German teacher, I have this problem frequently (encoding problems due to Umlaut). Here is a script to replace all characters (too big to post the script here): https://github.com/hansalemaos/LatinFixer/blob/main/__init__.py

Related

How can I access the content of a registry key in Python? [duplicate]

from _winreg import *
"""print r"*** Reading from SOFTWARE\Microsoft\Windows\CurrentVersion\Run ***" """
aReg = ConnectRegistry(None,HKEY_LOCAL_MACHINE)
aKey = OpenKey(aReg, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall")
for i in range(1024):
try:
asubkey=EnumKey(aKey,i)
val=QueryValueEx(asubkey, "DisplayName")
print val
except EnvironmentError:
break
Could anyone please correct the error...i just want to display the "DisplayName" within the subkeys of the key the HKLM\SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall
This is the error i get..
Traceback (most recent call last):
File "C:/Python25/ReadRegistry", line 10, in <module>
val=QueryValueEx(asubkey, "DisplayName")
TypeError: The object is not a PyHKEY object
Documentation says that EnumKey returns string with key's name. You have to explicitly open it with winreg.OpenKey function. I've fixed your code snippet:
import winreg
aReg = winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE)
print(r"*** Reading from %s ***" % aKey)
aKey = winreg.OpenKey(aReg, r'SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall')
for i in range(1024):
try:
aValue_name = winreg.EnumKey(aKey, i)
oKey = winreg.OpenKey(aKey, aValue_name)
sValue = winreg.QueryValueEx(oKey, "DisplayName")
print(sValue)
except EnvironmentError:
break
Please note, that not every key has "DisplayName" value available.
What about x86 on x64? Use 64-bit Specific Types
What if there's more than 1024 sub-keys in "Uninstall"? Use _winreg.QueryInfoKey(key)
Python 2:
import errno, os, _winreg
proc_arch = os.environ['PROCESSOR_ARCHITECTURE'].lower()
proc_arch64 = os.environ['PROCESSOR_ARCHITEW6432'].lower()
if proc_arch == 'x86' and not proc_arch64:
arch_keys = {0}
elif proc_arch == 'x86' or proc_arch == 'amd64':
arch_keys = {_winreg.KEY_WOW64_32KEY, _winreg.KEY_WOW64_64KEY}
else:
raise Exception("Unhandled arch: %s" % proc_arch)
for arch_key in arch_keys:
key = _winreg.OpenKey(_winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall", 0, _winreg.KEY_READ | arch_key)
for i in xrange(0, _winreg.QueryInfoKey(key)[0]):
skey_name = _winreg.EnumKey(key, i)
skey = _winreg.OpenKey(key, skey_name)
try:
print _winreg.QueryValueEx(skey, 'DisplayName')[0]
except OSError as e:
if e.errno == errno.ENOENT:
# DisplayName doesn't exist in this skey
pass
finally:
skey.Close()
Python 3:
import errno, os, winreg
proc_arch = os.environ['PROCESSOR_ARCHITECTURE'].lower()
proc_arch64 = os.environ['PROCESSOR_ARCHITEW6432'].lower()
if proc_arch == 'x86' and not proc_arch64:
arch_keys = {0}
elif proc_arch == 'x86' or proc_arch == 'amd64':
arch_keys = {winreg.KEY_WOW64_32KEY, winreg.KEY_WOW64_64KEY}
else:
raise Exception("Unhandled arch: %s" % proc_arch)
for arch_key in arch_keys:
key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall", 0, winreg.KEY_READ | arch_key)
for i in range(0, winreg.QueryInfoKey(key)[0]):
skey_name = winreg.EnumKey(key, i)
skey = winreg.OpenKey(key, skey_name)
try:
print(winreg.QueryValueEx(skey, 'DisplayName')[0])
except OSError as e:
if e.errno == errno.ENOENT:
# DisplayName doesn't exist in this skey
pass
finally:
skey.Close()
As it says in the _winreg.QueryValueEx documentation, you need to pass an already open key. EnumKey returns a string, not an open key.
aReg = ConnectRegistry(None,HKEY_LOCAL_MACHINE)
aKey = OpenKey(aReg, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall")
for i in range(1024):
try:
keyname = EnumKey(aKey, i)
asubkey = OpenKey(aKey, keyname)
val = QueryValueEx(asubkey, "DisplayName")
print val
except WindowsError:
break
I simplified _winreg functionality for querying a given registry key's nested values.
For instance, this is how straight-forward it is to query the registry key you asked about:
key = r'HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall'
for sub_key in get_sub_keys(key):
path = join(key, sub_key)
value = get_values(path, ['DisplayName', 'DisplayVersion', 'InstallDate'])
if value:
print value
output
{'DisplayVersion': u'347.25', 'DisplayName': u'NVIDIA Control Panel 347.25', 'InstallDate': u'20150125'}
{'DisplayVersion': u'347.25', 'DisplayName': u'NVIDIA Graphics Driver 347.25', 'InstallDate': u'20150125'}
{'DisplayVersion': u'2.2.2', 'DisplayName': u'NVIDIA GeForce Experience 2.2.2', 'InstallDate': u'20150212'}
...
Add these utility functions as well:
from _winreg import *
import os
roots_hives = {
"HKEY_CLASSES_ROOT": HKEY_CLASSES_ROOT,
"HKEY_CURRENT_USER": HKEY_CURRENT_USER,
"HKEY_LOCAL_MACHINE": HKEY_LOCAL_MACHINE,
"HKEY_USERS": HKEY_USERS,
"HKEY_PERFORMANCE_DATA": HKEY_PERFORMANCE_DATA,
"HKEY_CURRENT_CONFIG": HKEY_CURRENT_CONFIG,
"HKEY_DYN_DATA": HKEY_DYN_DATA
}
def parse_key(key):
key = key.upper()
parts = key.split('\\')
root_hive_name = parts[0]
root_hive = roots_hives.get(root_hive_name)
partial_key = '\\'.join(parts[1:])
if not root_hive:
raise Exception('root hive "{}" was not found'.format(root_hive_name))
return partial_key, root_hive
def get_sub_keys(key):
partial_key, root_hive = parse_key(key)
with ConnectRegistry(None, root_hive) as reg:
with OpenKey(reg, partial_key) as key_object:
sub_keys_count, values_count, last_modified = QueryInfoKey(key_object)
try:
for i in range(sub_keys_count):
sub_key_name = EnumKey(key_object, i)
yield sub_key_name
except WindowsError:
pass
def get_values(key, fields):
partial_key, root_hive = parse_key(key)
with ConnectRegistry(None, root_hive) as reg:
with OpenKey(reg, partial_key) as key_object:
data = {}
for field in fields:
try:
value, type = QueryValueEx(key_object, field)
data[field] = value
except WindowsError:
pass
return data
def get_value(key, field):
values = get_values(key, [field])
return values.get(field)
def join(path, *paths):
path = path.strip('/\\')
paths = map(lambda x: x.strip('/\\'), paths)
paths = list(paths)
result = os.path.join(path, *paths)
result = result.replace('/', '\\')
return result
Made a simple program to produce a list of the subkeys in the registry. Currently, trying to figure out how to search that list for specific subkeys and then add them if they are not there, and/or change the value. (shouldn't be too difficult)
This is what I came up with. I hope it helps, also feel free to critique it:
from winreg import *
registry = ConnectRegistry(None, HKEY_LOCAL_MACHINE)
def openRegistryA():
rawKeyA = OpenKey(registry, "SOFTWARE\Microsoft\Windows\CurrentVersion\Policies\System")
try:
i = 0
while 1:
name, value, type = EnumValue(rawKeyA, i)
print(name, value, i)
i += 1
except WindowsError:
print("END")
CloseKey(rawKeyA)
def openRegistryB():
rawKeyB = OpenKey(registry, "SYSTEM\CurrentControlSet\Services\LanmanServer\Parameters")
try:
i = 0
while 1:
name, value, type = EnumValue(rawKeyB, i)
print(name, value, i)
i += 1
except WindowsError:
print("END")
CloseKey(rawKeyB)
openRegistryA()
openRegistryB()

except: IndentationError: unindent does not match outer indentation level

when I compile this code I get File "", line 60
except:
^
IndentationError: unindent does not match any outer indentation level
what is the solution for this?
def readPassword():
print(':')
path='./wifipasswords.txt'#
file=open(path,'r')#
while True:
try:
pad=file.readline()#
bool=wifiConnect(pad)#,
if bool:
print(': ',pad)
print('wifi!!!')
break
else:
print('...',pad)
print('\n ')
except:
continue
#
readPassword()
you had indentation error at except.
import itertools as its
words='1234567890'#
r=its.product(words,repeat=3)#,,repeat=5
dic=open('./wifipasswords.txt','a')#,,
for i in r:
dic.write(''.join(i))
dic.write(''.join('\n'))
print(i)
dic.close()
print('')
import pywifi
from pywifi import const
import time
#,
def wifiConnect(pwd):
wifi=pywifi.PyWiFi()#
ifaces=wifi.interfaces()[0]#
iface.disconnect()#
time.sleep(1)#
wifistatus=ifces.status()#
if wifistatus==const.IFACE_DISCONNECTED:#wifi
profile=pywifi.Profile()#wifi
profile.ssid='gebilaowang'#wifi
profile.auth=const.AUTH_ALG_OPEN#
profile.akm.append(const.AKM_TYPE_WPA2PSK)#wifi,wifiwps
profile.clipher=const.CIPHER_TYPE_CCMP#
profile.key=pwd#
ifaces.remove_all_network_profiles()#
#
tep_profile=ifaces.add_network_profile(profile)
ifaces.connect(tep_profile)
time.sleep(3)#,wifi,
if ifaces.status()==const.IFACE_CONNECTED:#
return True
else:
return False
else:
print('wifi,')
#
def readPassword():
print(':')
path='./wifipasswords.txt'#
file=open(path,'r')#
while True:
try:
pad=file.readline()#
bool=wifiConnect(pad)#,
if bool:
print(': ',pad)
print('wifi!!!')
break
else:
print('...',pad)
print('\n ')
except: # HERE INDENTATION
continue
#
readPassword()
The traceback clearly defines your error:
IndentationError: unindent does not match any outer indentation level
The except block must be aligned in python. In python, indentation matters. Also, all imports should be at the top, and you should parts of your code with a whitespace. (dic = open('./wifipasswords.txt', 'a') vs dic=open('./wifipasswords.txt', 'a')) I recommend installing a formatter like autopep8 to avoid these issues. Your updated code should look like this:
import itertools as its
import pywifi
from pywifi import const
import time
words = '1234567890' #
r = its.product(words, repeat=3) # ,,repeat=5
dic = open('./wifipasswords.txt', 'a') # ,,
for i in r:
dic.write(''.join(i))
dic.write(''.join('\n'))
print(i)
dic.close()
print('')
# ,
def wifiConnect(pwd):
wifi = pywifi.PyWiFi() #
ifaces = wifi.interfaces()[0] #
iface.disconnect() #
time.sleep(1) #
wifistatus = ifces.status() #
if wifistatus == const.IFACE_DISCONNECTED: # wifi
profile = pywifi.Profile() # wifi
profile.ssid = 'gebilaowang' # wifi
profile.auth = const.AUTH_ALG_OPEN #
profile.akm.append(const.AKM_TYPE_WPA2PSK) # wifi,wifiwps
profile.clipher = const.CIPHER_TYPE_CCMP #
profile.key = pwd #
ifaces.remove_all_network_profiles() #
#
tep_profile = ifaces.add_network_profile(profile)
ifaces.connect(tep_profile)
time.sleep(3) # ,wifi,
if ifaces.status() == const.IFACE_CONNECTED: #
return True
else:
return False
else:
print('wifi,')
#
def readPassword():
print(':')
path = './wifipasswords.txt' #
file = open(path, 'r') #
while True:
try:
pad = file.readline() #
bool = wifiConnect(pad) # ,
if bool:
print(': ', pad)
print('wifi!!!')
break
else:
print('...', pad)
print('\n ')
except:
continue
#
readPassword()

why the process not run in python

i have a bot(query, key) function to post data, dicts(query, answer) to wrap the return result, and query_pipe(query_list) to process list of query request. But when i put that in multiprocessing.Process, i found that bot(query, key) return nothing. Here's my code.
def bot(query, key):
data = {
'key' : key,
'info' : query,
'userid' : 'wechat-robot',
}
try:
apiUrl = url
page = requests.post(apiUrl, data=data)
if page.json()['code'] == '100000':
answer = page.json()['text']
return dicts(query, answer)
else:
return dicts(query, 'failed')
except Exception as e:
return '500 Error'
def dicts(query, answer):
return {'query': query, 'answer': answer}
def query_pipe(query_list):
keys_pool = []
with open('keys.txt', 'r') as f:
lines = f.readlines()
for line in lines:
keys_pool.append(line.strip('\n'))
idx = 0
print(bot(query_list[0], keys_pool[0]))
p = Process(target=query_pipe, args=(query_data,))
p.start()
p.join()
But when i run the query_pipe(query_list) which not using multiprocess.Process, query_pipe(query_list) would print the correct output. I feel so confused, so anyone could give me a hint would be highly appreciated.

Fixing faulty unicode strings

A faulty unicode string is one that has accidentally encoded bytes in it.
For example:
Text: שלום, Windows-1255-encoded: \x99\x8c\x85\x8d, Unicode: u'\u05e9\u05dc\u05d5\u05dd', Faulty Unicode: u'\x99\x8c\x85\x8d'
I sometimes bump into such strings when parsing ID3 tags in MP3 files. How can I fix these strings? (e.g. convert u'\x99\x8c\x85\x8d' into u'\u05e9\u05dc\u05d5\u05dd')
You could convert u'\x99\x8c\x85\x8d' to '\x99\x8c\x85\x8d' using the latin-1 encoding:
In [9]: x = u'\x99\x8c\x85\x8d'
In [10]: x.encode('latin-1')
Out[10]: '\x99\x8c\x85\x8d'
However, it seems like this is not a valid Windows-1255-encoded string. Did you perhaps mean '\xf9\xec\xe5\xed'? If so, then
In [22]: x = u'\xf9\xec\xe5\xed'
In [23]: x.encode('latin-1').decode('cp1255')
Out[23]: u'\u05e9\u05dc\u05d5\u05dd'
converts u'\xf9\xec\xe5\xed' to u'\u05e9\u05dc\u05d5\u05dd' which matches the desired unicode you posted.
If you really want to convert u'\x99\x8c\x85\x8d' into u'\u05e9\u05dc\u05d5\u05dd', then this happens to work:
In [27]: u'\x99\x8c\x85\x8d'.encode('latin-1').decode('cp862')
Out[27]: u'\u05e9\u05dc\u05d5\u05dd'
The above encoding/decoding chain was found using this script:
guess_chain_encodings.py
"""
Usage example: guess_chain_encodings.py "u'баба'" "u'\xe1\xe0\xe1\xe0'"
"""
import six
import argparse
import binascii
import zlib
import utils_string as us
import ast
import collections
import itertools
import random
encodings = us.all_encodings()
Errors = (IOError, UnicodeEncodeError, UnicodeError, LookupError,
TypeError, ValueError, binascii.Error, zlib.error)
def breadth_first_search(text, all = False):
seen = set()
tasks = collections.deque()
tasks.append(([], text))
while tasks:
encs, text = tasks.popleft()
for enc, newtext in candidates(text):
if repr(newtext) not in seen:
if not all:
seen.add(repr(newtext))
newtask = encs+[enc], newtext
tasks.append(newtask)
yield newtask
def candidates(text):
f = text.encode if isinstance(text, six.text_type) else text.decode
results = []
for enc in encodings:
try:
results.append((enc, f(enc)))
except Errors as err:
pass
random.shuffle(results)
for r in results:
yield r
def fmt(encs, text):
encode_decode = itertools.cycle(['encode', 'decode'])
if not isinstance(text, six.text_type):
next(encode_decode)
chain = '.'.join( "{f}('{e}')".format(f = func, e = enc)
for enc, func in zip(encs, encode_decode) )
return '{t!r}.{c}'.format(t = text, c = chain)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('start', type = ast.literal_eval, help = 'starting unicode')
parser.add_argument('stop', type = ast.literal_eval, help = 'ending unicode')
parser.add_argument('--all', '-a', action = 'store_true')
args = parser.parse_args()
min_len = None
for encs, text in breadth_first_search(args.start, args.all):
if min_len is not None and len(encs) > min_len:
break
if type(text) == type(args.stop) and text == args.stop:
print(fmt(encs, args.start))
min_len = len(encs)
if __name__ == '__main__':
main()
Running
% guess_chain_encodings.py "u'\x99\x8c\x85\x8d'" "u'\u05e9\u05dc\u05d5\u05dd'" --all
yields
u'\x99\x8c\x85\x8d'.encode('latin_1').decode('cp862')
u'\x99\x8c\x85\x8d'.encode('charmap').decode('cp862')
u'\x99\x8c\x85\x8d'.encode('rot_13').decode('cp856')
etc.

PyPDF Merge and Write issue

I am getting an unexpected error when using this. The first section is from a script that I found online, and I am trying to use it to pull a particular section identified in the PDF's outline. Everything works fine, except right at output.write(outputfile1) it says:
PdfReadError: multiple definitions in dictionary.
Anybody else run into this? Please forgive all the unnecessary prints at the end. :)
import pyPdf
import glob
class Darrell(pyPdf.PdfFileReader):
def getDestinationPageNumbers(self):
def _setup_outline_page_ids(outline, _result=None):
if _result is None:
_result = {}
for obj in outline:
if isinstance(obj, pyPdf.pdf.Destination):
_result[(id(obj), obj.title)] = obj.page.idnum
elif isinstance(obj, list):
_setup_outline_page_ids(obj, _result)
return _result
def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
if _result is None:
_result = {}
if pages is None:
_num_pages = []
pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
t = pages["/Type"]
if t == "/Pages":
for page in pages["/Kids"]:
_result[page.idnum] = len(_num_pages)
_setup_page_id_to_num(page.getObject(), _result, _num_pages)
elif t == "/Page":
_num_pages.append(1)
return _result
outline_page_ids = _setup_outline_page_ids(self.getOutlines())
page_id_to_page_numbers = _setup_page_id_to_num()
result = {}
for (_, title), page_idnum in outline_page_ids.iteritems():
result[title] = page_id_to_page_numbers.get(page_idnum, '???')
return result
for fileName in glob.glob("*.pdf"):
output = pyPdf.PdfFileWriter()
print fileName
pdf = Darrell(open(fileName, 'rb'))
template = '%-5s %s'
print template % ('page', 'title')
for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
print template % (p+1,t)
for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
if t == "CATEGORY 1":
startpg = p+1
print p+1,'is the first page of Category 1.'
if t == "CATEGORY 2":
endpg = p+1
print p+1,'is the last page of Category 1.'
print startpg, endpg
pagenums = range(startpg,endpg)
print pagenums
for i in pagenums:
output.addPage(pdf.getPage(i))
fileName2 = "%sCategory1_data.pdf" % (str(fileName[:-13]))
print "%s has %s pages." % (fileName2,output.getNumPages())
outputfile1 = file(r"%s" % (fileName2), 'wb')
output.write(outputfile1)
outputfile1.close()
I know it might be too late for you, but for anyone else who will stumble here to look for the answer:
I had the same problem today, setting:
export_reader = PdfFileReader(filename, strict=False)
If you are just merging, then use:
merger = PdfFileMerger(strict=False)
This way, you will get only a warning, rather than an exception.

Categories

Resources