My function reads multiple .sgm files. I get an error when reading the content from the file speficially at line contents = f.read()
def block_reader(path):
filePaths = []
for filename in os.listdir(path):
if filename.endswith(".sgm"):
filePaths.append(os.path.join(path, filename))
continue
else:
continue
for file in filePaths:
with open(file, 'r') as f:
print(f)
contents = f.read()
soup = BeautifulSoup(contents, "lxml")
return ["test content"]
Error message
Traceback (most recent call last):
File "./block-1-reader.py", line 32, in <module>
for reuters_file_content in solutions.block_reader(path):
File "/home/ragith/Documents/A-School/Fall-2020/COMP_479/Assignment_1/solutions.py", line 29, in block_reader
contents = f.read()
File "/usr/lib/python3.6/codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 1519554: invalid start byte
Try this: with open(path, 'rb') as f: That b in the mode specifier in the open() states that the file shall be treated as binary, so contents will remain a bytes. No decoding attempt will happen this way. More details at: this link
Related
I've been trying to iterate through a csv file with the following code:
`
import csv
import os, sys
directory = "/Users/aliharam/Desktop/Lamis File"
files = []
for filename in os.listdir(directory):
f = os.path.join(directory, filename)
# checking if it is a file
if os.path.isfile(f):
files.append(f)
files.pop()
for i in files:
with open(i, 'r') as csvfile:
datareader = csv.reader(csvfile)
for row in datareader:
print(row)
`
This is the error I am getting:
Traceback (most recent call last):
File "/Users/aliharam/PycharmProjects/LamisTasks/Normalization.py", line 16, in <module>
for row in datareader:
File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbf in position 355: invalid start byte
['\tAli Haram \tAli Haram ']
Process finished with exit code 1
How do I fix this?!!
I tried using
dataset = pd.read_csv(i, header= 0,
encoding= 'unicode_escape')
and
with io.open(filename, 'r', encoding='utf-8') as fn:
lines = fn.readlines()
both didn't work
The file your program reads contains character(at position 355) which does not belong to Unicode.
If we assume you are reading a Unicode encoded file, then there is an error in your data file. First you need to make sure the file your program reads is encoded in Unicode or not.
I want to define a function that can be implemented on each xml file in the directory in order to parse it and get the content from the tags in a dataframe.
from xml.etree import ElementTree
def func(path, filename):
for filename in os.listdir(path):
with open(os.path.join(path, filename)) as file:
# Read each line in the file, readlines() returns a list of lines
content = file.readlines()
# Combine the lines in the list into a string
content = "".join(content)
bs_content = bs(content, "lxml")
headline = bs_content.find_all("headline")
eventtitle = bs_content.find_all("eventtitle")
city = bs_content.find_all("city")
companyname = bs_content.find_all("companyname")
companyticker = bs_content.find_all("companyticker")
startdate = bs_content.find_all("startdate")
eventstory = bs_content.find_all("eventstory")
data = []
for i in range(0,len(companyname)):
rows = [companyname[i].get_text(),headline[i].get_text(),
city[i].get_text(),eventtitle[i].get_text(),
companyticker[i].get_text(),startdate[i].get_text(),
eventstory[i].get_text()]
data.append(rows)
df = pd.DataFrame(data,columns = ['companyname','headline',
'city','eventtitle','companyticker',
'startdate','eventstory'], dtype = float)
When I call a function I receive this error. Unfortunately, every existing solutions didn't not work for me.
func('./Calls/', '1000015_T.xml')
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
Input In [58], in <module>
----> 1 func('./Calls/', '1000015_T.xml')
Input In [57], in func(path, filename)
7 for filename in os.listdir(path):
8 with open(os.path.join(path, filename)) as file:
9 # Read each line in the file, readlines() returns a list of lines
---> 10 content = file.readlines()
11 # Combine the lines in the list into a string
12 content = "".join(content)
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/codecs.py:322, in BufferedIncrementalDecoder.decode(self, input, final)
319 def decode(self, input, final=False):
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte
Maybe you can also help me with code optimization.
My task is to get the content of 2k xml files and so far I decided to define a function and then to use pandarallel: parallel_apply(func)
The input file is not UTF-8, it is likely some other code page.
Determine what the correct encoding is and alter your program accordingly.
I've been trying to read a .dat file from an Agilent impedance analyzer. I keep getting the same error regardless of the method I try. Any ideas how to get around this issue?
Thanks in advance.
# import csv
# Method 1
# with open("RP.dat") as infile, open("outfile.csv", "w") as outfile:
# csv_writer = csv.writer(outfile)
# prev = ''
# csv_writer.writerow(['ID', 'PARENT_ID'])
# for line in infile.read().splitlines():
# csv_writer.writerow([line, prev])
# prev = line
# Method 2
# import numpy as np
# filename = 'RP.dat'
# indata = np.loadtxt(filename)
# print(indata)
# Method 3
with open("RP.dat") as infile:
file_contents = infile.readlines()
print(file_contents)
C:\Users\benjy\Workspace\urop>python read_dat.py
Traceback (most recent call last):
File "C:\Users\benjy\Workspace\urop\read_dat.py", line 17, in <module>
file_contents = infile.readlines()
File "C:\Users\benjy\AppData\Local\Programs\Python\Python39\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 672: character maps to <undefined>
You can use codecs library
import codecs
with codecs.open('RP.dat', errors='ignore', encoding='utf-8') as f:
dat = f.read()
I am reading a songs file in csv format and I do not know what I am doing wrong.
import csv
import os
import random
file = open("songs.csv", "rU")
reader = csv.reader(file)
for song in reader:
print(song[0], song[1], song[2])
file.close()
This is the error:
Traceback (most recent call last):
File "/Users/kuku/Desktop/hey/mine/test.py", line 10, in <module>
for song in reader:
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/encodings/ascii.py", line 26, in decode
return codecs.ascii_decode(input, self.errors)[0]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 414: ordinal not in range(128)
try
for song in [unicode(song, 'utf-8') for song in reader]:
print(...)
With this bit of your code:
for song in reader:
print( song[0], song[1],song[2])
you are printing elements 0, 1 and 2 of the lines in reader during each iteration of the loop. This will cause a (different) error if there are fewer than 3 elements in total.
If you don't know that there will be at least 3 elements in each line, you could include the code in a try, except block:
with open("songs.csv", "r") as f:
song_reader = csv.reader(f)
for song_line in song_reader:
lyric = song_line
try:
print(lyric[0], lyric[1], lyric[2])
except:
pass # ...or preferably do something better
It's worth noting that in most cases it is preferable to open a file within a with block, as shown above. This negates the need for file.close().
You can open the file in utf-8 encoding.
file = open("songs.csv", "rU", encoding="utf-8")
I am trying to open and readlines a .txt file that contains a large amount of text. Below is my code, i dont know how to solve this problem. Any help would be very appreciated.
file = input("Please enter a .txt file: ")
myfile = open(file)
x = myfile.readlines()
print (x)
when i enter the .txt file this is the full error message is displayed below:
line 10, in <module> x = myfile.readlines()
line 26, in decode return codecs.ascii_decode(input, self.errors)[0]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 318: ordinal not in range(128)
Instead of using codecs, I solve it this way:
def test():
path = './test.log'
file = open(path, 'r+', encoding='utf-8')
while True:
lines = file.readlines()
if not lines:
break
for line in lines:
print(line)
You must give encoding param precisely.
You can also try to encode :
with open(file) as f:
for line in f:
line = line.encode('ascii','ignore').decode('UTF-8','ignore')
print(line)
#AndriiAbramamov is right, your shoud check that question, here is a way you can open your file which is also on that link
import codecs
f = codecs.open('words.txt', 'r', 'UTF-8')
for line in f:
print(line)
Another way is to use regex, so when you open the file you can remove any special character like double quotes and so on.