Read xml as a txt in python - python

I have the following code in Python (which only loads data from a txt file):
def main():
    """Read text.txt and return its lines with surrounding whitespace stripped.

    Returns:
        A list with one stripped string per line of the file.
    """
    # The with-block guarantees the file is closed even if reading raises.
    with open("text.txt", "r") as f:  # load txt
        a = [line.strip() for line in f]  # one entry per line
    return a


if __name__ == "__main__":
    main()
How can I do this with an XML file? f = open("myxml.xml", "r" ) doesn't work. I get the error: UnicodeDecodeError: 'charmap' codec can't decode byte 0x88 in position 4877: character maps to <undefined>

This has nothing to do with the XML file format, but with the encoding your file is in. Python3 assumes everything to be in utf-8, but if you are on Windows your file is probably in windows-1252. You should use:
# Name the codec explicitly instead of relying on the default one.
# cp1252 is this answer's guess; substitute the file's actual encoding.
f = open("text.txt", "r", encoding="cp1252")

This will surely do the job.
# Read the whole XML file as one string; the with-block closes the file
# on exit, so no explicit close (or closed check) is needed.
with open('reboot.xml', 'r') as f:
    a = f.read()
print(a)

Related

Reading CSV file. UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 336: character maps to

Trying to read two CSV files based on a function but when reading one (yelp.csv) I encounter an error:
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 336: character maps to
I tried specifying the encoding, but the error persists. I have identified that the issue occurs when using .readlines(). I am not sure how to fix it.
def readDataFromFile(fileName, seperator, encoding="utf8"):
    """Return the rows of a delimited text file, skipping the header line.

    Every line after the first has its newline characters stripped and is
    split on ``seperator``; the result is a list of those field lists.
    """
    # NOTE(review): ``encoding`` is accepted but never passed to open(), so
    # the value supplied by the caller has no effect on how the file is read.
    with open(fileName, 'r') as panelf:
        panelf.readline() # skip header
        lines = []
        data = panelf.readlines()
        for line in data:
            line = line.strip("\n").split(seperator)
            lines.append(line)
    return lines
# Load both CSV files; the encoding argument is ignored by the function above.
panelData = readDataFromFile("Desktop/panel.csv", ",", encoding="utf-8")
yelpData = readDataFromFile("Desktop/yelp.csv", ",", encoding="utf-8")
The encoding parameter is never passed to open(), so it is not used. Try:
def readDataFromFile(fileName, seperator, encoding="utf8"):
    """Return the rows of a delimited text file, skipping the header line.

    Args:
        fileName: Path of the file to read.
        seperator: Delimiter string each data line is split on.
        encoding: Codec used to decode the file (default ``"utf8"``).

    Returns:
        A list containing one list of field strings per data line.

    Raises:
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    with open(fileName, 'r', encoding=encoding) as panelf:
        panelf.readline()  # skip header
        # Iterate the file directly rather than materialising readlines().
        return [line.strip("\n").split(seperator) for line in panelf]
# Load both CSV files, passing the encoding to use for each one.
panelData = readDataFromFile("Desktop/panel.csv", ",", encoding="utf-8")
yelpData = readDataFromFile("Desktop/yelp.csv", ",", encoding="utf-8")

How to fix this error? 'utf-8' codec can't decode byte 0xef in position 32887: invalid continuation byte

enter image description here
Hello. I am trying to open this file which is in .txt format but it gives me an error.
Sometimes, when your files are not uniformly encoded, you have to be specific about the correct encoding.
You should indicate it in the open function, for example:
# State the encoding explicitly when opening the file.
with open('file.txt', encoding='utf-8') as f:
    ...  # etc. (work with f here)
You can also detect the file encoding like this:
from chardet import detect

# Sniff the encoding from the raw bytes, then reopen the same file as text.
with open('file.txt', 'rb') as f:
    rawdata = f.read()
    # NOTE(review): the detected value may be None if chardet cannot decide —
    # confirm against the chardet documentation before relying on it.
    enc = detect(rawdata)['encoding']
with open('file.txt', encoding=enc) as f:
    ...  # etc. (work with f here)
Result:
>>> from chardet import detect
>>>
>>> with open('test.c', 'rb') as f:
... rawdata = f.read()
... enc = detect(rawdata)['encoding']
...
>>> print(enc)
ascii
Python 3.7.0

Encoding and decoding string in Python

I want to write a string to a file using Python. I know how to do that, so that's not a problem. I also wish to encode that string once it has been written. The encoding doesn't really matter, so I'll stick to let's say UTF-32. What I do for that is after I wrote the string, I read from the file again, encode the string into bytes and then re-write to the same file. I can do the encoding part, but my problem arises with the decoding. I want to read it as bytes so that I can convert it back to a str. What I do for this is the same principle: Read from file, decode and write to the same file. What I get from reading the encoded string looks like b'\xff\xfe\x00\x001\x00\x00\x004\x00\x00\x002\x00\x00\x00'
When I read this as bytes, it doubles the b and the backslashes. If I read it like this, as a string, and then try to decode, it keeps saying 'str' object does not have attribute decode or something. I know that I can't decode the string, but if I try with bytes it seems to be "doubling" the bytes.
Here is my code:
def readfile(filename):
    """Return the contents of ``filename`` read in text mode."""
    f = open(filename, 'r')
    s = f.read()
    f.close()
    return s


def readfile_b(filename):
    """Return the contents of ``filename`` read in binary mode."""
    f = open(filename, 'rb')
    s = f.read()
    f.close()
    return s


def writefile(filename, writeobject):
    """Overwrite ``filename`` with ``writeobject`` using text mode."""
    f = open(filename, 'w')
    f.write(writeobject)
    f.close()


def encode(filename):
    """Encode the file's text as UTF-32 and write str() of those bytes back."""
    s = readfile(filename)
    s_enc = bytes(s, 'utf-32')
    # NOTE(review): str() of a bytes object is its repr ("b'...'"), so the
    # file receives that repr as text rather than the encoded bytes.
    writefile(filename, str(s_enc))


def decode(filename):
    """Read the file as bytes, decode them as UTF-32 and write the text back."""
    s_enc = readfile_b(filename)
    print(s_enc)
    s = str(s_enc, 'utf-32')
    writefile(filename, s)


# Both steps operate on the same file in place.
encode("Example.txt")
decode("Example.txt")
Output (for decode(), encode() didn't have any errors):
b"b'\\xff\\xfe\\x00\\x00H\\x00\\x00\\x00e\\x00\\x00\\x00l\\x00\\x00\\x00l\\x00\\x00\\x00o\\x00\\x00\\x00'"
Traceback (most recent call last):
File "C:/bla/bla/bla/bla/Example.py", line 29, in <module>
decode("MamaAccount.txt")
File "C:/bla/bla/bla/bla/Example.py", line 26, in decode
s = str(s_enc, 'utf-32')
UnicodeDecodeError: 'utf-32-le' codec can't decode bytes in position 0-3: code point not in range(0x110000)
Any help is greatly appreciated
Try using writefile with binary writing. Currently you are writing the bytes object converted to a string (its repr, b'...'), so when you read that back as bytes you get the doubled b prefix and the doubled backslashes.
This works for me:
def readfile(filename):
    """Return the full contents of ``filename`` read in text mode.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as a ``str``.
    """
    # The with-block closes the handle even if read() raises.
    with open(filename, 'r') as f:
        return f.read()
def readfile_b(filename):
    """Return the full contents of ``filename`` as raw bytes.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as ``bytes``, with no decoding applied.
    """
    # The with-block closes the handle even if read() raises.
    with open(filename, 'rb') as f:
        return f.read()
def writefile(filename, writeobject):
    """Overwrite ``filename`` with the string ``writeobject`` in text mode.

    Args:
        filename: Path of the file to create or truncate.
        writeobject: Text to write.
    """
    # The with-block flushes and closes the handle even if write() raises.
    with open(filename, 'w') as f:
        f.write(writeobject)
def writefile_b(filename, writeobject):
    """Overwrite ``filename`` with the bytes ``writeobject`` in binary mode.

    Args:
        filename: Path of the file to create or truncate.
        writeobject: Bytes to write verbatim.
    """
    # The with-block flushes and closes the handle even if write() raises.
    with open(filename, 'wb') as f:
        f.write(writeobject)
def encode(filename):
    """Write the UTF-32 encoding of the file's text to ``"bin_" + filename``."""
    text = readfile(filename)
    payload = text.encode('utf-32')
    target = "bin_" + filename
    writefile_b(target, payload)
def decode(filename):
    """Decode a UTF-32 file, print the text and save it to ``"dec_" + filename``."""
    raw = readfile_b(filename)
    text = raw.decode('utf-32')
    print(text)
    target = "dec_" + filename
    writefile(target, text)
# Round trip: encode() writes bin_Example.txt, which decode() then reads
# and writes back out as dec_bin_Example.txt.
encode("Example.txt")
decode("bin_Example.txt")

codecs.ascii_decode(input, self.errors)[0] UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 318: ordinal not in range(128)

I am trying to open a .txt file that contains a large amount of text and call readlines() on it. Below is my code; I don't know how to solve this problem. Any help would be very much appreciated.
# Ask for a path, read every line of that file, and print the resulting list.
file = input("Please enter a .txt file: ")
myfile = open(file)  # no encoding argument is given here
x = myfile.readlines()  # the line that raises in the traceback quoted below
print (x)
When I enter the .txt file, the full error message displayed is shown below:
line 10, in <module> x = myfile.readlines()
line 26, in decode return codecs.ascii_decode(input, self.errors)[0]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 318: ordinal not in range(128)
Instead of using codecs, I solve it this way:
def test():
    """Print every line of ./test.log, decoding the file as UTF-8."""
    path = './test.log'
    # Read-only mode is enough because nothing is written, and the
    # with-block guarantees the handle is closed.
    with open(path, 'r', encoding='utf-8') as file:
        # Iterating the file yields each line once, until end of file.
        for line in file:
            print(line)
You must pass the encoding parameter explicitly.
You can also try dropping the non-ASCII characters:
# Drop undecodable bytes while reading: the UnicodeDecodeError is raised
# during the read itself, so re-encoding a line afterwards cannot prevent it.
with open(file, encoding='ascii', errors='ignore') as f:
    for line in f:
        print(line)
#AndriiAbramamov is right, you should check that question. Here is a way you can open your file, which is also on that link:
import codecs

# Open the file with an explicit UTF-8 decoder; the with-block closes it.
with codecs.open('words.txt', 'r', 'UTF-8') as f:
    for line in f:
        print(line)
Another way is to use a regex, so that when you open the file you can remove any special characters such as double quotes and so on.

Appending hexlifyed content to file

# Read test.png as bytes, print its size, then append its hex form to test.tp.
file_1 = ('test.png')
with open(file_1, 'rb') as b:
    file_hex = b.read()  # raw bytes of the image
binascii.hexlify(file_hex)  # NOTE(review): this result is discarded
file_1_size = len(file_hex)
print (file_1_size)
file_new = open("test.tp", "a")  # text-mode append
# hexlify() output is written straight to the text-mode file; this is the
# call that fails with the TypeError quoted below.
file_new.write(binascii.hexlify(file_hex))
file_new.close()
I've been trying to get this hexlified content appended to the file. I've even tried assigning the hexlified content to a variable of its own, like this:
# Same as the first attempt, but the hexlify() result is kept in a variable.
file_1 = ('test.png')
with open(file_1, 'rb') as b:
    file_hex = b.read()  # raw bytes of the image
x = binascii.hexlify(file_hex)  # hex form of the file content
file_1_size = len(file_hex)
print (file_1_size)
file_new = open("test.tp", "a")  # text-mode append
# Writing x to the text-mode file fails with the TypeError quoted below.
file_new.write(x)
file_new.close()
Both end with the error:
TypeError: must be str, not bytes
Open your file in binary mode to append bytes:
# Binary append mode ("ab") lets the bytes in x be written without decoding.
with open("test.tp", "ab") as file_new:
    file_new.write(x)
or decode your bytes to a string first:
# Text append mode needs str, so decode the hex digits (pure ASCII) first.
with open("test.tp", "a") as file_new:
    file_new.write(x.decode('ascii'))
Hex digits fall within the ASCII code range, so decoding with that codec is safe.

Categories

Resources