#!/usr/bin/env python3
import glob
import xml.etree.ElementTree as ET
filenames = glob.glob("C:\\Users\\####\\Desktop\\BNC2\\[A00-ZZZ]*.xml")
out_lines = []
for filename in filenames:
with open(filename, 'r', encoding="utf-8") as content:
tree = ET.parse(content)
root = tree.getroot()
for w in root.iter('w'):
lemma = w.get('hw')
pos = w.get('pos')
tag = w.get('c5')
out_lines.append(w.text + "," + lemma + "," + pos + "," + tag)
with open("C:\\Users\\####\\Desktop\\bnc.txt", "w") as out_file:
for line in out_lines:
line = bytes(line, 'utf-8').decode('utf-8', 'ignore')
out_file.write("{}\n".format(line))
Gives the error:
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 0: character maps to undefined
I thought this line would have solved that:
line = bytes(line, 'utf-8').decode('utf-8', 'ignore')
You need to specify the encoding when opening the output file, same as you did with the input file:
with open("C:\\Users\\####\\Desktop\\bnc.txt", "w", encoding="utf-8") as out_file:
for line in out_lines:
out_file.write("{}\n".format(line))
If your script have multiple reads and writes and you want to have a particular encoding ( let's say utf-8) for all of them, we can change the default encoding too
import sys
reload(sys)
sys.setdefaultencoding('UTF8')
We should use it only when we have multiple reads/writes though and should be done at the beginning of the script
Changing default encoding of Python?
Related
I have a .txt file in UTF-16-LE encoding .
I want to remove the headers(1st row) and save it in ANSI
I can do it maually but i need to do that for 150 txt files EVERY day
So i wanted to use Python to do it automatically.
But i am stuck ,
i have tried this code but it is not working ,produces an error :
*"return mbcs_encode(input, self.errors)[0]
UnicodeEncodeError: 'mbcs' codec can't encode characters in position 0--1: invalid character "*
filename = "filetochangecodec.txt"
path = "C:/Users/fallen/Desktop/New folder/"
pathfile = path + filename
coding1 = "utf-16-le"
coding2 = "ANSI"
f= open(pathfile, 'r', encoding=coding1)
content= f.read()
f.close()
f= open(pathfile, 'w', encoding=coding2)
f.write(content)
f.close()
A noble contributer helped me with the solution and i now post it so everyone can benefit and save time.
Instead of trying to write all the content , we make a list with every line of the txt file and then we write them in a new file one by one with the use of " for " .
import os
inpath = r"C:/Users/user/Desktop/insert/"
expath = r"C:/Users/user/Desktop/export/"
encoding1 = "utf-16"
encoding2 = "ansi"
input_filename = "text.txt"
input_pathfile = os.path.join(inpath, input_filename)
output_filename = "new_text.txt"
output_pathfile = os.path.join(expath, output_filename)
with open(input_pathfile, 'r', encoding=encoding1) as file_in:
lines = []
for line in file_in:
lines.append(line)
with open(output_pathfile, 'w', encoding='ANSI') as f:
for line in lines:
f.write(line)
When I use the CountVectorizer in sklearn, it needs the file encoding in unicode, but my data file is encoding in ansi.
I tried to change the encoding to unicode using notepad++, then I use readlines, it cannot read all the lines, instead it can only read the last line. After that, I tried to read the line into data file, and write them into the new file by using unicode, but I failed.
def merge_file():
root_dir="d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
resname='resule_final.txt'
if os.path.exists(resname):
os.remove(resname)
result = codecs.open(resname,'w','utf-8')
num = 1
for back_name in os.listdir(r'd:\\workspace\\minibatchk-means\\data\\20_newsgroups'):
current_dir = root_dir + str(back_name)
for filename in os.listdir(current_dir):
print num ,":" ,str(filename)
num = num+1
path=current_dir + "\\" +str(filename)
source=open(path,'r')
line = source.readline()
line = line.strip('\n')
line = line.strip('\r')
while line !="":
line = unicode(line,"gbk")
line = line.replace('\n',' ')
line = line.replace('\r',' ')
result.write(line + ' ')
line = source.readline()
else:
print 'End file :'+ str(filename)
result.write('\n')
source.close()
print 'End All.'
result.close()
The error message is :UnicodeDecodeError: 'gbk' codec can't decode bytes in position 0-1: illegal multibyte sequence
Oh,I find the way.
First, use chardet to detect string encoding.
Second,use codecs to input or output to the file in the specific encoding.
Here is the code.
import chardet
import codecs
import os
root_dir="d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
num = 1
failed = []
for back_name in os.listdir("d:\\workspace\\minibatchk-means\\data\\20_newsgroups"):
current_dir = root_dir + str(back_name)
for filename in os.listdir(current_dir):
print num,":",str(filename)
num=num+1
path=current_dir+"\\"+str(filename)
content = open(path,'r').read()
source_encoding=chardet.detect(content)['encoding']
if source_encoding == None:
print '??' , filename
failed.append(filename)
elif source_encoding != 'utf-8':
content=content.decode(source_encoding,'ignore')
codecs.open(path,'w',encoding='utf-8').write(content)
print failed
Thanks for all your help.
I have written code to create frequency table. but it is breaking at the line ext_string = document_text.read().lower(. I even put a try and except to catch the error but it is not helping.
import re
import string
frequency = {}
file = open('EVG_text mining.txt', encoding="utf8")
document_text = open('EVG_text mining.txt', 'r')
text_string = document_text.read().lower()
match_pattern = re.findall(r'\b[a-z]{3,15}\b', text_string)
for word in match_pattern:
try:
count = frequency.get(word,0)
frequency[word] = count + 1
except UnicodeDecodeError:
pass
frequency_list = frequency.keys()
for words in frequency_list:
print (words, frequency[words])
You are opening your file twice, the second time without specifying the encoding:
file = open('EVG_text mining.txt', encoding="utf8")
document_text = open('EVG_text mining.txt', 'r')
You should open the file as follows:
frequencies = {}
with open('EVG_text mining.txt', encoding="utf8", mode='r') as f:
text = f.read().lower()
match_pattern = re.findall(r'\b[a-z]{3,15}\b', text)
...
The second time you were opening your file, you were not defining what encoding to use which is probably why it errored.
The with statement helps perform certain task linked with I/O for a file. You can read more about it here: https://www.pythonforbeginners.com/files/with-statement-in-python
You should probably have a look at error handling as well as you were not enclosing the line that was actually causing the error: https://www.pythonforbeginners.com/error-handling/
The code ignoring all decoding issues:
import re
import string # Do you need this?
with open('EVG_text mining.txt', mode='rb') as f: # The 'b' in mode changes the open() function to read out bytes.
bytes = f.read()
text = bytes.decode('utf-8', 'ignore') # Change 'ignore' to 'replace' to insert a '?' whenever it finds an unknown byte.
match_pattern = re.findall(r'\b[a-z]{3,15}\b', text)
frequencies = {}
for word in match_pattern: # Your error handling wasn't doing anything here as the error didn't occur here but when reading the file.
count = frequencies.setdefault(word, 0)
frequencies[word] = count + 1
for word, freq in frequencies.items():
print (word, freq)
To read a file with some special characters, use encoding as 'latin1' or 'unicode_escape'
I am using the code below to parse the XML format wikipedia training data into a pure text file:
from __future__ import print_function
import logging
import os.path
import six
import sys
from gensim.corpora import WikiCorpus
if __name__ == '__main__':
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))
# check and process input arguments
if len(sys.argv) != 3:
print("Using: python process_wiki.py enwiki.xxx.xml.bz2 wiki.en.text")
sys.exit(1)
inp, outp = sys.argv[1:3]
space = " "
i = 0
output = open(outp, 'w')
wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
for text in wiki.get_texts():
if six.PY3:
output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
# ###another method###
# output.write(
# space.join(map(lambda x:x.decode("utf-8"), text)) + '\n')
else:
output.write(space.join(text) + "\n")
i = i + 1
if (i % 10000 == 0):
logger.info("Saved " + str(i) + " articles")
output.close()
logger.info("Finished Saved " + str(i) + " articles")
when I run this code, it gives me a following error message:
File "wiki_parser.py", line 42, in <module>
output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
UnicodeEncodeError: 'cp949' codec can't encode character '\u1f00' in position 1537: illegal multibyte sequence
When I searched this error online, most answers told me to add 'utf-8' as the encoding which is already there. What could be the possible issue with the code?
Minimal example
The problem is that your file is opened with an implicit encoding (inferred from your system). I can recreate your issue as follows:
a = '\u1f00'
with open('f.txt', 'w', encoding='cp949') as f:
f.write(a)
Error message: UnicodeEncodeError: 'cp949' codec can't encode character '\u1f00' in position 0: illegal multibyte sequence
You have two options. Either open the file using an encoding which can encode the character you are using:
with open('f.txt', 'w', encoding='utf-8') as f:
f.write(a)
Or open the file as binary and write encoded bytes:
with open('f.txt', 'wb') as f:
f.write(a.encode('utf-8'))
Applied to your code:
I would replace this part:
output = open(outp, 'w')
wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
for text in wiki.get_texts():
if six.PY3:
output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
# ###another method###
# output.write(
# space.join(map(lambda x:x.decode("utf-8"), text)) + '\n')
else:
output.write(space.join(text) + "\n")
with this:
from io import open
wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
with open(outp, 'w', encoding='utf=8') as output:
for text in wiki.get_texts():
output.write(u' '.join(text) + u'\n')
which should work in both Python 2 and Python 3.
I am having an encoding issue when I run my script below:
Here is the error code:
-UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 9: ordinal not in range(128)
Here is my script:
import logging
import urllib
import csv
import json
import io
import codecs
with open('/home/local/apple.csv',
'rb') as csvinput:
reader = csv.reader(csvinput, delimiter=',')
firstline = True
for row in reader:
if firstline:
firstline = False
continue
address1 = row[0]
print row[0]
locality = row[1]
admin_area = row[2]
query = ' '.join(str(x) for x in (address1, locality, admin_area))
normalized = query.replace(" ", "+")
BaseURL = 'http://localhost:8080/verify?country=JP&freeform='
URL = BaseURL + normalized
print URL
data = urllib.urlopen(URL)
response = data.getcode()
print response
if response == 200:
file= json.load(data)
print file
output_f=open('output.csv','wb')
csvwriter=csv.writer(output_f)
count = 0
for f in file:
if count == 0:
header= f.keys()
csvwriter.writerow(header)
count += 1
csvwriter.writerow(f.values())
output_f.close()
else:
print 'error'
can anyone help me fix this its getting really annoying. I need to encode to utf8
Looks like you are using Python 2.x, instead of python's standard open, use codecs.open where you can optionally pass an encoding to use and what to do when there are errors. Gets a little less confusing in Python 3 where the standard Python open can do this.
So in your two lines where you are opening, do:
with codecs.open('/home/local/apple.csv',
'rb', 'utf-8') as csvinput:
output_f = codecs.open('output.csv','wb', 'utf-8')
The optional error parm defaults to "strict" which raises an exception if the bytes can't be mapped to the given encoding. In some contexts you may want to use 'ignore' or 'replace'.
See the python doc for a bit more info.