UnicodeEncodeError: 'latin-1' codec can't encode character '\u2019' - python

I'm dealing with this error I don't know what's the problem I have already added the font Arial and I've also tried to use the function Unicode() but nothing the same error appears
UnicodeEncodeError: 'latin-1' codec can't encode character '\u2019' in position 59: ordinal not in range(256)
p.s: I'm using mac os
and this is the code
import numpy as np
import pandas as pd
from fpdf import FPDF
from PyPDF2 import *
from time import *
import matplotlib.pyplot as plt
#function to convert minutes to date string
def Minutes_To_Date(date_minutes):
if date_minutes>0:
Secondes = int(date_minutes*60)
else :
Secondes = 0
series_format=strftime('%H:%M:%S', gmtime(Secondes))
return series_format
#function to make the data frame into a table in pdf
def output_df_to_pdf(pdf,df):
table_cell_width = 25 # set the height of the cell and the width
table_cell_height = 6
pdf.add_font("Arial", "", "arial.ttf", uni=True)
pdf.set_font("Arial", "B", 8)
cols = df.columns
#type the first row (name of columns)
for col in cols:
pdf.cell(table_cell_width, table_cell_height, col, align = 'C', border = 1)
pdf.ln(table_cell_height)
pdf.set_font("Arial", "", 10)
#type the rows
for row in df.itertuples():
for col in cols :
value = str(getattr(row,col))
pdf.cell(table_cell_width, table_cell_height, value, align = "C", border=1)
pdf.ln(table_cell_height)
#the path of our data
path = r"/Users/mac/Desktop/data_test.xlsx"
#load the data into data frame
df = pd.read_excel(path, sheet_name='Feuil1')
#add the hours columns to the dataframe
df["heure"] = (df["Date dernier envoi"].dt.hour)
#add the "delai de validation " columns to dataframe
df["Délai de validation"] = (df["Date action"] - df["Date dernier envoi"])/np.timedelta64(1, 'm') #calculate period in minutes
#create 2 pivot table one to be seen and the other to make graphs
df_pivot = pd.pivot_table(df, values="Délai de validation", index="heure", columns="Nom Service", aggfunc = "mean" , margins=True)
df_pivot_seen = pd.pivot_table(df, values="Délai de validation", index="heure", columns="Nom Service", aggfunc = "mean")
date_minutes_1 = list(df_pivot_seen["AMPE"]) #convert the data frame column to list
date_minutes_2 = list(df_pivot_seen["AMPI"])
#convert the number of minutes to string in form of date
for i in range(len(date_minutes_1)):
date_minutes_1[i] = Minutes_To_Date(date_minutes_1[i])
date_minutes_2[i] = Minutes_To_Date(date_minutes_2[i])
#convert to data frame
df_pivot_seen["AMPE"] = pd.DataFrame(date_minutes_1)
df_pivot_seen["AMPI"] = pd.DataFrame(date_minutes_2)
#create a diagram
df_pivot.plot()
plt.savefig("chart.png")
#create fpdf object with default values page:A4 and mesure will be in millimeters
pdf = FPDF()
pdf.add_font("Arial", "", "arial.ttf", uni=True)
pdf.add_page()
pdf.set_font("Arial","B", 16)
pdf.cell(40,10,"Rapport d’activité Import/Export de la semaine S52")
pdf.ln(20)
pdf.image("chart.png")
pdf.ln(20)
output_df_to_pdf(pdf, df_pivot_seen)
pdf.output("/Users/mac/Desktop/report.pdf", "F")
print(df_pivot_seen)
UnicodeEncodeError Traceback (most recent call last)
/var/folders/yv/yjnc8p5j64s70dv3dfh9td600000gn/T/ipykernel_76346/884241888.py in <module>
83
84
---> 85 pdf.output("/Users/mac/Desktop/report.pdf")
86
87
~/opt/anaconda3/lib/python3.9/site-packages/fpdf/fpdf.py in output(self, name, dest)
1063 #Finish document if necessary
1064 if(self.state<3):
-> 1065 self.close()
1066 dest=dest.upper()
1067 if(dest==''):
~/opt/anaconda3/lib/python3.9/site-packages/fpdf/fpdf.py in close(self)
244 self._endpage()
245 #close document
--> 246 self._enddoc()
247
248 def add_page(self, orientation=''):
~/opt/anaconda3/lib/python3.9/site-packages/fpdf/fpdf.py in _enddoc(self)
1634 def _enddoc(self):
1635 self._putheader()
-> 1636 self._putpages()
1637 self._putresources()
1638 #Info
~/opt/anaconda3/lib/python3.9/site-packages/fpdf/fpdf.py in _putpages(self)
1168 if self.compress:
1169 # manage binary data as latin1 until PEP461 or similar is implemented
-> 1170 p = self.pages[n].encode("latin1") if PY3K else self.pages[n]
1171 p = zlib.compress(p)
1172 else:
UnicodeEncodeError: 'latin-1' codec can't encode character '\u2019' in position 59: ordinal not in range(256)

I solved this by changing the font. The original font (Arial only allowed latin-1.

Related

PyPDF2 Font Read Issue

I'm writing a script to automate extracting data from pdfs I receive. I'm using PyPDF2 to read the pdfs and extract the text to be interpreted. I've tested pdfs with two different formats. The script works perfectly for the first format. When trying it with the second format I'm getting an indexing error (below). After troubleshooting I've found the issue is due to the font used in the second format. They use "Roboto" while the first, successful format, uses Arial.
I've attached stripped-down versions of the pdfs that are causing issues. One in Roboto and one I manually changed to Arial.
https://drive.google.com/drive/folders/1BhaXPfNyLx8euR2dPQaTqdHvtYJg8yEh?usp=sharing
The snippet of code here is where I'm running into the issue:
import PyPDF2
pdf_roboto = r"C:\Users\Robert.Smyth\Python\test_pdf_roboto.pdf"
pdf_arial = r"C:\Users\Robert.Smyth\Python\test_pdf_arial.pdf"
reader = PyPDF2.PdfFileReader(pdf_roboto)
pageObj = reader.pages[0]
pages_text = pageObj.extractText()
The indexing error I'm getting is:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
C:\Users\ROBERT~1.SMY\AppData\Local\Temp/ipykernel_22076/669450932.py in <module>
1 reader = PyPDF2.PdfFileReader(pdf_roboto)
2 pageObj = reader.pages[0]
----> 3 pages_text = pageObj.extractText()
~\Anaconda3\lib\site-packages\PyPDF2\_page.py in extractText(self, Tj_sep, TJ_sep)
1539 """
1540 deprecate_with_replacement("extractText", "extract_text")
-> 1541 return self.extract_text()
1542
1543 def _get_fonts(self) -> Tuple[Set[str], Set[str]]:
~\Anaconda3\lib\site-packages\PyPDF2\_page.py in extract_text(self, Tj_sep, TJ_sep, orientations, space_width, *args)
1511 orientations = (orientations,)
1512
-> 1513 return self._extract_text(
1514 self, self.pdf, orientations, space_width, PG.CONTENTS
1515 )
~\Anaconda3\lib\site-packages\PyPDF2\_page.py in _extract_text(self, obj, pdf, orientations, space_width, content_key)
1144 if "/Font" in resources_dict:
1145 for f in cast(DictionaryObject, resources_dict["/Font"]):
-> 1146 cmaps[f] = build_char_map(f, space_width, obj)
1147 cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str] = (
1148 "charmap",
~\Anaconda3\lib\site-packages\PyPDF2\_cmap.py in build_char_map(font_name, space_width, obj)
20 space_code = 32
21 encoding, space_code = parse_encoding(ft, space_code)
---> 22 map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)
23
24 # encoding can be either a string for decode (on 1,2 or a variable number of bytes) of a char table (for 1 byte only for me)
~\Anaconda3\lib\site-packages\PyPDF2\_cmap.py in parse_to_unicode(ft, space_code)
187 cm = prepare_cm(ft)
188 for l in cm.split(b"\n"):
--> 189 process_rg, process_char = process_cm_line(
190 l.strip(b" "), process_rg, process_char, map_dict, int_entry
191 )
~\Anaconda3\lib\site-packages\PyPDF2\_cmap.py in process_cm_line(l, process_rg, process_char, map_dict, int_entry)
247 process_char = False
248 elif process_rg:
--> 249 parse_bfrange(l, map_dict, int_entry)
250 elif process_char:
251 parse_bfchar(l, map_dict, int_entry)
~\Anaconda3\lib\site-packages\PyPDF2\_cmap.py in parse_bfrange(l, map_dict, int_entry)
256 lst = [x for x in l.split(b" ") if x]
257 a = int(lst[0], 16)
--> 258 b = int(lst[1], 16)
259 nbi = len(lst[0])
260 map_dict[-1] = nbi // 2
IndexError: list index out of range
I've found that if I use the exact same pdf and all I change is the font from Roboto to Arial, PyPDF2 has no problem extracting the text. I've searched online and in the PyPDF2 documentation but I can't find any solution on how to get it to extract text in the Roboto font, or add the Roboto font to the PyPDF2 font library.
I'd really appreciate if anyone could provide some advice on how to solve this issue.
Note: manually changing the font from Roboto to Arial isn't a desirable option as I receive hundreds of these invoices monthly.

'gbk' codec can't encode character '\u2022' in position 32: illegal multibyte sequence

There is a question about writting file.
when I used data.to_csv('/home/bio_kang/Learning/Python/film_project/top250_film_info.csv', index=None, encoding='gbk'), it given me a error that 'gbk' codec can't encode character '\u2022' in position 32: illegal multibyte sequence.
The data come from a website https://movie.douban.com/top250. I use requests , beautifulsoup and re to get them form the website.
And, here is my part code:
uni_num = []
years = []
countries = []
directors = []
actors = []
descriptions = []
for i in range(250):
with open('/home/bio_kang/Learning/Python/film_project/film_info/film_{}.html'.format(i), 'rb') as f:
film_info = f.read().decode('utf-8','ignore')
pattern_uni_num = re.compile(r'<span class="pl">IMDb:</span> (.*?)<br/>') # unique number
pattern_year = re.compile(r'<span class="year">\((.*?)\)</span>') # year
pattern_country = re.compile(r'<span class="pl">制片国家/地区:</span>(.*?)<br/>') # country
pattern_director = re.compile(r'<meta content=(.*?) property="video:director"/>') # director
pattern_actor = re.compile(r'<meta content="(.*?)" property="video:actor"/>') # actors
pattern_description = re.compile(r'<meta content="(.*?)property="og:description">') # description
uni_num.append(str(re.findall(pattern_uni_num, film_info)).strip("[]").strip("'"))
years.append(str(re.findall(pattern_year, film_info)).strip("[]").strip("'"))
countries.append(str(re.findall(pattern_country, film_info)).strip("[]").strip("'").split('/')[0])
directors.append(re.findall(pattern_director, film_info))
actors.append(re.findall(pattern_actor, film_info))
descriptions.append(str(re.findall(pattern_description, film_info)).strip('[]').strip('\''))
raw_data = {'encoding':uni_num, 'name':names, 'description':descriptions, 'country':countries, 'director':new_director, 'actor':new_actor, 'vote':new_votes, 'score':scores, 'year':years, 'link':urls }
data = pd.DataFrame(raw_data)
data.to_csv('/home/bio_kang/Learning/Python/film_project/top250_film_info.csv', index=None, encoding='gbk')
try that:
open('...','rb',encoding='utf-8')
or utf-16

Why I have this problem with index range?why does it not work?

I have got this error when try split my one column to few columns. But it split on just on one or two columns.If you wanna split on 3,4,5 columns it writes:
ValueError Traceback (most recent call last)
/usr/local/Cellar/jupyterlab/2.1.5/libexec/lib/python3.8/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
349 try:
--> 350 return self._range.index(new_key)
351 except ValueError:
ValueError: 2 is not in range
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-19-d4e6a4d03e69> in <module>
22 data_old[Col_1_Label] = newz[0]
23 data_old[Col_2_Label] = newz[1]
---> 24 data_old[Col_3_Label] = newz[2]
25 #data_old[Col_4_Label] = newz[3]
26 #data_old[Col_5_Label] = newz[4]
/usr/local/Cellar/jupyterlab/2.1.5/libexec/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
2798 if self.columns.nlevels > 1:
2799 return self._getitem_multilevel(key)
-> 2800 indexer = self.columns.get_loc(key)
2801 if is_integer(indexer):
2802 indexer = [indexer]
/usr/local/Cellar/jupyterlab/2.1.5/libexec/lib/python3.8/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
350 return self._range.index(new_key)
351 except ValueError:
--> 352 raise KeyError(key)
353 return super().get_loc(key, method=method, tolerance=tolerance)
354
KeyError: 2
There is my code.I have csv file.And when pandas read it - create one column with value 'Контракт'.Then. I split it on another columns. But it split till two columns.I wanna 7 columns!Help please to understand this logic!
import pandas as pd
from pandas import Series, DataFrame
import re
dframe1 = pd.read_csv('po.csv')
columns = ['Контракт']
data_old = pd.read_csv('po.csv', header=None, names=columns)
data_old
# The thing you want to split the column on
SplitOn = ':'
# Name of Column you want to split
Split_Col = 'Контракт'
newz = data_old[Split_Col].str.split(pat=SplitOn, n=-1, expand=True)
# Column Labels (you can add more if you will have more)
Col_1_Label = 'Номер телефону'
Col_2_Label = 'Тарифний пакет'
Col_3_Label = 'Вихідні дзвінки з України за кордон'
Col_4_Label = 'ВАРТІСТЬ ПАКЕТА/ЩОМІСЯЧНА ПЛАТА'
Col_5_Label = 'ЗАМОВЛЕНІ ДОДАТКОВІ ПОСЛУГИ ЗА МЕЖАМИ ПАКЕТА'
Col_6_Label = 'Вартість послуги "Корпоративна мережа'
Col_7_Label = 'ЗАГАЛОМ ЗА КОНТРАКТОМ (БЕЗ ПДВ ТА ПФ)'
data_old[Col_1_Label] = newz[0]
data_old[Col_2_Label] = newz[1]
data_old[Col_3_Label] = newz[2]
#data_old[Col_4_Label] = newz[3]
#data_old[Col_5_Label] = newz[4]
#data_old[Col_6_Label] = newz[5]
#data_old[Col_7_Label] = newz[6]
data_old
Pandas does not support "unstructured text", you should convert it to a standard format or python objects and then create a dataframe from it
Imagine that you have a file with this text named data.txt:
Contract № 12345679 Number of phone: +7984563774
Total price for month : 00.00000
Total price: 10.0000
You can load an process it with Python like this:
with open('data.txt') as f:
content = list(data.readlines())
# First line contains the contract number and phone information
contract, phone = content[0].split(':')
# find contract number using regex
contract = re.findall('\d+', contract)[0]
# The phone is strightforward
phone = phone.strip()
# Second line and third line for prices
total_price = float(content[1].split(':')[1].strip())
total_month_price = float(content[2].split(':')[1].strip())
Then with these variables you can create a dataframe
df = pd.DataFrame([dict(N_of_contract=contract, total_price=total_price, total_month_price =total_month_price )])
Repeat the same for all files.

Error when using google translate API to translate a dataframe

I'm trying to translate part of SQuAD 1.1 dataset to Sinhalese. I don't know whether i can use the json file straight into translation
What i tried so far is making a little dataframe of SQuAD dataset and try to translate that as a demo to myself. But i got different errors. Below is the error i'm getting now. Can you help me to fix that error or tell me a better way to complete my task using python.
```import googletrans
from googletrans import Translator
import os
from google.cloud import translate_v2 as translate
os.environ['GOOGLE_APPLICATION_CREDENTIALS']=r"C:\Users\Sathsara\Documents\Python Learning\Translation test\translationAPI\flash-medley-278816-b2012b874797.json"
# create a translator object
translator = Translator()
# use translate method to translate a string - by default, the destination language is english
translated = translator.translate('I am Sathsara Rasantha',dest='si')
# the translate method returns an object
print(translated)
# obtain translated string by using attribute .text
translated.text
import pandas as pd
translate_example = pd.read_json("example2.json")
translate_example
contexts = []
questions = []
answers_text = []
answers_start = []
for i in range(translate_example.shape[0]):
topic = translate_example.iloc[i,0]['paragraphs']
for sub_para in topic:
for q_a in sub_para['qas']:
questions.append(q_a['question'])
answers_start.append(q_a['answers'][0]['answer_start'])
answers_text.append(q_a['answers'][0]['text'])
contexts.append(sub_para['context'])
df = pd.DataFrame({"context":contexts, "question": questions, "answer_start": answers_start, "text": answers_text})
df
df=df.loc[0:2,:]
df
# make a deep copy of the data frame
df_si = df.copy()
# translate columns' name using rename function
df_si.rename(columns=lambda x: translator.translate(x).text, inplace=True)
df_si.columns
translations = {}
for column in df_si.columns:
# unique elements of the column
unique_elements = df_si[column].unique()
for element in unique_elements:
# add translation to the dictionary
translations[element] = translator.translate(element,dest='si').text
print(translations)
# modify all the terms of the data frame by using the previously created dictionary
df_si.replace(translations, inplace = True)
# check translation
df_si.head()```
This is the error i get
> --------------------------------------------------------------------------- TypeError Traceback (most recent call
> last) <ipython-input-24-f55a5ca59c36> in <module>
> 5 for element in unique_elements:
> 6 # add translation to the dictionary
> ----> 7 translations[element] = translator.translate(element,dest='si').text
> 8
> 9 print(translations)
>
> ~\Anaconda3\lib\site-packages\googletrans\client.py in translate(self,
> text, dest, src)
> 170
> 171 origin = text
> --> 172 data = self._translate(text, dest, src)
> 173
> 174 # this code will be updated when the format is changed.
>
> ~\Anaconda3\lib\site-packages\googletrans\client.py in
> _translate(self, text, dest, src)
> 73 text = text.decode('utf-8')
> 74
> ---> 75 token = self.token_acquirer.do(text)
> 76 params = utils.build_params(query=text, src=src, dest=dest,
> 77 token=token)
>
> ~\Anaconda3\lib\site-packages\googletrans\gtoken.py in do(self, text)
> 199 def do(self, text):
> 200 self._update()
> --> 201 tk = self.acquire(text)
> 202 return tk
>
> ~\Anaconda3\lib\site-packages\googletrans\gtoken.py in acquire(self,
> text)
> 144 a = []
> 145 # Convert text to ints
> --> 146 for i in text:
> 147 val = ord(i)
> 148 if val < 0x10000:
>
> TypeError: 'numpy.int64' object is not iterable

Mean of an Array Created from a CSV in Python

I am trying to find the mean of an array created from data in a CSV file using Python. Data in the array is included between a range of values, so it does not include all the values in the column of the CSV. My current code that creates the array is shown below. Several arrays have been created, but I only need to find the mean of the array called "T07s". I am consistently getting the error "cannot perform reduce with flexible type" when using the function np.mean(T07s)
import csv
class dataPoint:
def __init__(self, V, T07, T19, T27, Time):
self.V = V
self.T07 = T07
self.T19 = T19
self.T27 = T27
self.Time = Time
dataPoints = []
with open("data_final.csv") as csvfile:
reader = csv.reader(csvfile)
next(reader)
for row in reader:
if 229 <= float(row[2]) <= 231:
temp = dataPoint(row[1], row[12], row[24], row[32], row[0].split(" ")[1])
dataPoints.append(temp)
T07s = np.array([x.T07 for x in dataPoints])
The data included in T07s is shown below:
for x in T07s:
print(x)
37.2
539
435.6
717.4
587
757.9
861.8
1024.2
325
117.9
136.3
167.8
809
405.3
405.1
112.7
1317.1
1731.8
1080.2
1208.6
1212.6
1363.8
1715.3
2376.4
2563.9
2998.4
2934.7
2862.4
390.8
2332.2
2121
2237.6
2334.1
2082.2
1892.1
1888.8
1960.6
1329.1
1657.2
2042.4
1417.5
977.3
1442.8
561.2
500.3
413.3
324.1
693.7
750
865.7
434.2
635.2
815.7
171.4
829.3
815.3
774.8
1411.6
1685.1
1345.1
1193.2
1674.9
1636.4
1389.8
753.3
1102.8
908.3
1223.2
1199.4
1040.7
1040.9
824.7
620
795.7
810.4
378.8
643.2
441.8
682.8
417.8
515.6
2354.7
1938.8
1512.4
1933.5
1739.8
2281.9
1997.5
2833.4
182.8
202.4
217.3
234.2
741.9
Clearly more of a simple solution:
import pandas as pd
data = pd.read_csv('data_final.csv')
data_filtered = data[data.iloc[:,2] >= 229 & data.iloc[:,2] <= 231]
print(data_filtered['T07'].mean())

Categories

Resources