Unicode Decode Error in Python with files - python

I'm having trouble with decoding. Other threads show how to do it for simple strings, with u'string'.encode, but I can't find a way to make it work with files.
Any help would be appreciated!
Here's the code.
text = file.read()
text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))
file.seek(0) # rewind
file.write(text.encode('utf-8'))
and here's the whole code, should it help.
#!/usr/bin/env python
# coding: utf-8
"""
Script to helps on translate some code's methods from
portuguese to english.
"""
from multiprocessing import Pool
from mock import MagicMock
from goslate import Goslate
import fnmatch
import logging
import os
import re
import urllib2
# Number of worker processes; the main block only builds a Pool when this
# is greater than 1, otherwise the files are processed one after another.
_MAX_PEERS = 1

# Start each run with a fresh log file; a missing file is not an error.
try:
    os.remove('traducoes.log')
except OSError:
    pass

# Module-level logger that records every translation in traducoes.log.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('traducoes.log')
logger.addHandler(handler)
def fileWalker(ext, dirname, names):
    """
    Visitor callback that collects the files with the wanted extension.

    ``ext`` is a two-item sequence: ``ext[0]`` is the extension to look for
    and ``ext[1]`` is the list that receives ``dirname`` joined with every
    entry of ``names`` that matches ``"*" + ext[0]``.
    """
    pattern = "*" + ext[0]
    matching_paths = [
        os.path.join(dirname, name)
        for name in names
        if fnmatch.fnmatch(name, pattern)
    ]
    ext[1].extend(matching_paths)
def encontre_text(file):
    """
    Find the words in the file that contain an underscore followed by at
    least one more word character.

    ``file`` must yield UTF-8 encoded bytes from ``read()``; the content is
    decoded once here and the matches are returned as a list of unicode
    strings, in order of appearance.
    """
    text = file.read().decode('utf-8')
    # re.UNICODE makes \w match accented letters on Python 2 as well;
    # without it a name such as ``valor_média`` is cut at the accent.
    # Alternative pattern for double-quoted words: r"\"\w+\""
    return re.findall(r"\w+(?<=_)\w+", text, re.UNICODE)
def traduza_palavra(txt):
    """
    Translate the word/phrase to English.

    Unless ``txt`` starts with an underscore, its underscores are turned
    into spaces before translation; spaces in the translated text are
    turned back into underscores before it is returned.

    Exits the process with status -1 when google.com cannot be reached.
    """
    try:
        # Cheap connectivity probe before calling the translation service.
        response = urllib2.urlopen('http://google.com', timeout=2)
    except urllib2.URLError:
        print("No network connection ")
        exit(-1)
    else:
        # Only reachability matters; release the connection right away.
        response.close()
    if not txt.startswith('_'):
        txt = txt.replace('_', ' ')
    # Restore the accent so the translator sees the Portuguese word.
    txt = txt.replace(u'media', u'média')
    gs = Goslate()
    # Alternative: let the service detect the source language:
    # txt = gs.translate(txt, 'en', gs.detect(txt))
    txt = gs.translate(txt, 'en', 'pt-br')  # force Brazilian Portuguese as source
    # NOTE(review): purpose of the ' en ' -> ' br ' substitution is not
    # visible from here; kept exactly as in the original -- confirm.
    txt = txt.replace(' en ', ' br ')
    return txt.replace(' ', '_')  # .lower()
def subistitua(file, txt, novo_txt):
    """
    Rewrite the file in place with every ``txt`` replaced by ``novo_txt``.

    ``file`` must be open for reading and writing. ``txt`` and ``novo_txt``
    are unicode strings. Byte-mode files are treated as UTF-8: the content
    is decoded once after reading and encoded once before writing.
    """
    # The caller has usually scanned the file already, so rewind first;
    # reading from the current position would return an empty string.
    file.seek(0)
    raw = file.read()
    is_bytes = isinstance(raw, bytes)
    text = raw.decode('utf-8') if is_bytes else raw
    # str.replace returns a new string; the result must be kept.
    text = text.replace(txt, novo_txt)
    file.seek(0)  # rewind
    file.write(text.encode('utf-8') if is_bytes else text)
    # Drop leftovers of the old content when the new text is shorter.
    file.truncate()
def magica(File):
    """
    Translate, in place, the underscore identifiers of one file.

    This is the unit of work of the pool: every worker handles one element
    from the list of files. ``File`` is the path of that element.
    """
    if _MAX_PEERS == 1:  # not feasible with several workers
        logger.info('\n---- File %s', File)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(File, "r+") as file:
        for txt in encontre_text(file):
            novo_txt = traduza_palavra(txt)
            if txt != novo_txt:
                logger.info('%s -> %s [%s]', txt, novo_txt, File)
                subistitua(file, txt, novo_txt)
    print(File.ljust(70) + '[OK]'.rjust(5))
if __name__ == '__main__':
    # Fail fast when there is no connectivity: every translation needs it.
    try:
        response = urllib2.urlopen('http://www.google.com.br', timeout=1)
    except urllib2.URLError:
        print("No network connection ")
        exit(-1)
    else:
        response.close()

    # Collect every *.py file below ./app.
    root = './app'
    ex = ".py"
    files = []
    os.path.walk(root, fileWalker, [ex, files])
    print('%d files found to be translated' % len(files))

    # Start from a "not successful" result so the ``finally`` clause can
    # always report, even when an error happens before any work is done.
    result = MagicMock()
    result.successful.return_value = False
    try:
        if _MAX_PEERS > 1:
            _pool = Pool(processes=_MAX_PEERS)
            result = _pool.map_async(magica, files)
            result.wait()
            _pool.close()
            _pool.join()
        else:
            for f in files:
                magica(f)
            result.successful.return_value = True
    except AssertionError as e:
        print(e)
    finally:
        if result.successful():
            print('Translated all files')
        else:
            print('Some files were not translated')
Thank you all for the help!

In Python 2, reading from files produces regular (byte) string objects, not unicode objects. There is no need to call .encode() on these; in fact, that'll only trigger an automatic decode to Unicode first, which can fail.
Rule of thumb: use a unicode sandwich. Whenever you read data, you decode to unicode at that stage. Use unicode values throughout your code. Whenever you write data, encode at that point. You can use io.open() to open file objects that encode and decode automatically for you.
That also means you can use unicode literals everywhere; for your regular expressions, for your string literals. So use:
def encontre_text(file):
    """Return the words of the file that contain an underscore followed by
    at least one more word character, as a list of unicode strings."""
    text = file.read()  # assume `io.open()` was used, so this is unicode
    # re.UNICODE makes \w match accented letters on Python 2 too, and the
    # plain r"" prefix (unlike ur"") is valid on both Python 2 and 3.
    return re.findall(r"\w+(?<=_)\w+", text, re.UNICODE)
and
def subistitua(file, txt, novo_txt):
    """Rewrite ``file`` in place with every ``txt`` replaced by ``novo_txt``.

    ``file`` is a read/write text file object and both strings are unicode.
    """
    # Rewind before reading: the file has usually been scanned already and
    # a read from the end would return an empty string.
    file.seek(0)
    text = file.read()  # assume `io.open()` was used
    text = text.replace(txt, novo_txt)
    file.seek(0)  # rewind
    file.write(text)
    # Cut off the tail of the old content when the new text is shorter.
    file.truncate()
as all string values in the program are already unicode, and
txt = txt.replace(u'media', u'média')
as u'..' unicode string literals don't need decoding anymore.

Related

How to read special characters encoded in UTF-8 in python

I was trying to extract some data from a MySQL database using Python, but I have a problem with special characters (the data are strings in FR, ES, DE and IT). Whenever a word has a special character (an accent such as á or ñ, etc.), it is not encoded properly in the file (I'm creating a CSV with the extracted data).
This is the code I was using
import mysql.connector
if __name__ == '__main__':
cnx = mysql.connector.connect(user='user', password='psswrd',
host='slave',
database='DB',
buffered=True)
us_id_list = ['496305']
f = open('missing_cat_mappings.csv', 'w')
for (us_id) in us_id_list:
print us_id
mapping_cursor = cnx.cursor()
query = (format(user_id=us_id,))
success = False
fails = 0
while not success:
try:
print "try" + str(fails)
mapping_cursor.execute(query)
success = True
except:
fails += 1
if fails > 10:
raise
for row in mapping_cursor:
f.write(str(row) + "\n")
mapping_cursor.close()
f.close()
cnx.close()
I added:
#!/usr/bin/python
# vim: set fileencoding=<UTF-8> :
at the beginning, but it didn't make any difference.
Basically you will need to open the CSV file in binary mode, 'wb' not text mode 'w'

How do I perform error handling with two files?

I have two files, and to check their validity I am performing try and except twice. I don't think this is a good method; can you suggest a better way?
Here is my code:
def form_density_dictionary(self,word_file,fp_exclude):
self.freq_dictionary={}
try:
with open(fp_exclude,'r')as fp2:
words_excluded=fp2.read().split() #words to be excluded stored in a list
print("**Read file successfully :" + fp_exclude + "**")
words_excluded=[words.lower() for words in words_excluded] # converted to lowercase
except IOError:
print("**Could not read file:", fp_exclude, " :Please check file name**")
sys.exit()
try:
with open(word_file,'r') as file:
print("**Read file successfully :" + word_file + "**")
words_list=file.read()
if not words_list:
print("**No data in file:",word_file +":**")
sys.exit()
words_list=words_list.split()
words_list=[words.lower() for words in words_list] # lowercasing entire list
unique_words=list((set(words_list)-set(words_excluded)))
self.freq_dictionary= {word:("%6.2f"%(float((words_list.count(word))/len(words_list))*100)) for word in unique_words}
#print((len(self.freq_dictionary)))
except IOError:
print("**Could not read file:", word_file, " :Please check file name**")
sys.exit()
Any other suggestion is also welcomed to make it more pythonic.
The first thing that jumps out is the lack of consistency and readability: in some lines you indent with 4 spaces, on others you only use two; in some places you put a space after a comma, in others you don't, in most places you don't have spaces around the assignment operator (=)...
Be consistent and make your code readable. The most commonly used formatting is to use four spaces for indenting and to always have a space after a comma but even more important than that is to be consistent, meaning that whatever you choose, stick with it throughout your code. It makes it much easier to read for everyone, including yourself.
Here are a few other things I think you could improve:
Have a single exception handling block instead of two.
You can also open both files in a single line.
Even better, combine both previous suggestions and have a separate method to read data from the files, thus eliminating code repetition and making the main method easier to read.
For string formatting it's preferred to use .format() instead of %. Check this out: https://pyformat.info/
Overall try to avoid repetition in your code. If there's something you're doing more than once, extract it to a separate function or method and use that instead.
Here's your code quickly modified to how I'd probably write it, and taking these things into account:
import sys
class AtifImam:
    """Compute how often each non-excluded word occurs in a text file."""

    def __init__(self):
        # Maps each word to its share of all words in the file, stored as a
        # percentage string formatted with "{:6.2f}".
        self.freq_dictionary = {}

    def form_density_dictionary(self, word_file, exclude_file):
        """Fill ``freq_dictionary`` from the words in ``word_file``.

        Words are compared case-insensitively. Words listed in
        ``exclude_file`` get no entry, but they still count towards the
        total number of words. Exits the process when either file cannot
        be read or when ``word_file`` holds no words.
        """
        # Local import: the surrounding module only imports sys.
        from collections import Counter

        words_excluded = set(self.lowercase(self.read_words_list(exclude_file)))

        words_list = self.read_words_list(word_file)
        if not words_list:
            print("** No data in file: {} **".format(word_file))
            sys.exit()

        # Count every word once instead of calling list.count per word.
        counts = Counter(self.lowercase(words_list))
        total = len(words_list)
        self.freq_dictionary = {
            # float() first so the division is not truncated on Python 2.
            word: "{:6.2f}".format(float(count) / total * 100)
            for word, count in counts.items()
            if word not in words_excluded
        }

    @staticmethod
    def read_words_list(file_name):
        """Return the whitespace-separated words of ``file_name``.

        Prints a message and exits the process when the file cannot be
        read.
        """
        try:
            with open(file_name, 'r') as file:
                data = file.read()
            print("** Read file successfully: {} **".format(file_name))
            return data.split()
        except IOError as e:
            print("** Could not read file: {0.filename} **".format(e))
            sys.exit()

    @staticmethod
    def lowercase(word_list):
        """Return a new list with every word of ``word_list`` lowercased."""
        return [word.lower() for word in word_list]
Exceptions thrown that involve a file system path have a filename attribute that can be used instead of explicit attributes word_file and fp_exclude as you do.
This means you can wrap these IO operations in the same try-except and use the exception_instance.filename which will indicate in which file the operation couldn't be performed.
For example:
try:
with open('unknown_file1.py') as f1, open('known_file.py') as f2:
f1.read()
f2.read()
except IOError as e:
print("No such file: {0.filename}".format(e))
Eventually prints out:
No such file: unknown_file1.py
While the opposite:
try:
with open('known_file.py') as f1, open('unknown_file2.py') as f2:
f1.read()
f2.read()
except IOError as e:
print("No such file: {0.filename}".format(e))
Prints out:
No such file: unknown_file2.py
To be more 'pythonic' you could use something called Counter, from the collections library.
from collections import Counter
def form_density_dictionary(self, word_file, fp_exclude):
    """Fill ``self.freq_dictionary`` with the density of each word.

    Both files are read through ``self._open_file``. Words are compared
    case-insensitively; words found in ``fp_exclude`` get no entry. Each
    remaining word maps to its share of all words in ``word_file``, as a
    fraction formatted with two decimals.
    """
    success_msg = '*Read file succesfully : {filename}'
    fail_msg = '**Could not read file: {filename}: Please check filename'
    empty_file_msg = '*No data in file :{filename}:**'

    exclude_read = self._open_file(fp_exclude, success_msg, fail_msg, '')
    # A set, not a Counter: subtracting Counters would only lower the
    # counts of excluded words instead of removing them.
    excluded = {word.lower() for word in exclude_read.split()}

    word_file_read = self._open_file(word_file, success_msg, fail_msg, empty_file_msg)
    words = Counter(word.lower() for word in word_file_read.split())

    # Divide by the number of words read, not by the number of distinct
    # words; float() keeps the division exact on Python 2.
    total = sum(words.values())
    self.freq_dictionary = {word: '{:.2f}'.format(float(count) / total)
                            for word, count in words.items()
                            if word not in excluded}
Also it would be better if you would just create the open_file method, like:
def _open_file(self, filename, success_msg, fails_msg, empty_file_msg):
try:
with open(filename, 'r') as file:
if success_msg:
print(success_msg.format(filename= filename))
data = file.read()
if empty_file_msg:
print(empty_file_msg.format(filename= filename))
return data
except IOError:
if fail_msg:
print(fail_msg.format(filename= filename))
sys.exit()

Python 3.4: Trying to get full name results from nltk

I am a beginner in Python, and I am trying to collect the names from a txt and put them inside another txt file using NLTK. The issue is that only the first names are returned, without the surnames. Anything I can do? Here's the code:
import nltk
# function start
def extract_entities(text):
ind = len(text)-7
sub = text[ind:]
print(sub)
output.write('\nPRODID=='+sub+'\n\n')
for sent in nltk.sent_tokenize(text):
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
if hasattr(chunk, 'label'):
output.write(chunk.label()+':'+ ' '.join(c[0] for c in chunk.leaves())+'\n')
# function end
# main program
# -*- coding: utf-8 -*-
import sys
import codecs
sys.stdout = codecs.getwriter("iso-8859-1")(sys.stdout, 'xmlcharrefreplace')
if sys.stdout.encoding != 'cp850':
sys.stdout = codecs.getwriter('cp850')(sys.stdout.buffer, 'strict')
if sys.stderr.encoding != 'cp850':
sys.stderr = codecs.getwriter('cp850')(sys.stderr.buffer, 'strict')
file = open('C:\Python34\Description.txt', 'r')
output = open('C:\Python34\out.txt', 'w')
for line in file:
if not line : continue
extract_entities(line)
file.close()
output.close()
Thanks in advance for your answers!

Best way to read file contents of a file and set to empty string if exception happen

try:
content = open("/tmp/out").read()
except:
content = ""
Can I go any shorter or more elegant than this? I've to do it for more than one files so I want something more short.
Is writing function the only shorter way to do it?
What I actually want is this but I want to concat "" if there is any exception
lines = (open("/var/log/log.1").read() + open("/var/log/log").read()).split("\n")
Yes, you'll have to write something like
def get_contents(filename):
    """Return the whole content of ``filename``, or '' when opening or
    reading it raises EnvironmentError."""
    contents = ''
    try:
        with open(filename) as source:
            contents = source.read()
    except EnvironmentError:
        # Best effort by design: an unreadable file counts as empty.
        pass
    return contents
lines = (get_contents('/var/log/log.1')
+ get_contents('/var/log/log')).split('\n')
NlightNFotis raises a valid point, if the files are big, you don't want to do this. Maybe you'd write a line generator that accepts a list of filenames:
def get_lines(filenames):
    """Yield the lines of every file in ``filenames``, in order.

    A file that raises EnvironmentError while being opened or read is
    skipped and iteration moves on to the next name.
    """
    for path in filenames:
        try:
            with open(path) as stream:
                for text_line in stream:
                    yield text_line
        except EnvironmentError:
            # Unreadable file: move on to the next name.
            pass
...
for line in get_lines(["/var/log/log.1", "/var/log/log"]):
do_stuff(line)
Another way is to use the standard fileinput.FileInput class (thanks, J.F. Sebastian):
import fileinput
def eat_errors(f, mode):
    """Open hook for ``fileinput`` that never fails on a missing file.

    Returns ``open(f, mode)``; when that raises IOError, the null device
    is opened in the same mode instead, so the file reads as empty.
    """
    import os  # local import: the snippet itself only imports fileinput
    try:
        return open(f, mode)
    except IOError:
        return open(os.devnull, mode)
for line in fileinput.FileInput(["/var/log/log.1", "/var/log/log"], openhook=eat_errors):
do_stuff(line)
This code will monkey patch out open for another open that creates a FakeFile, which always returns an "empty" string if open throws an `IOError`.
Whilst it's more code than you'd really want to write for the problem at hand, it does mean that you have a reusable context manager for faking open if the need arises again (probably twice in the next decade)
with monkey_patched_open():
...
Actual code.
#!/usr/bin/env python
from contextlib import contextmanager
from StringIO import StringIO
################################################################################
class FakeFile(StringIO):
def __init__(self):
StringIO.__init__(self)
self.count = 0
def read(self, n=-1):
return "<empty#1>"
def readlines(self, sizehint = 0):
return ["<empty#2>"]
def next(self):
if self.count == 0:
self.count += 1
return "<empty#3>"
else:
raise StopIteration
################################################################################
@contextmanager
def monkey_patched_open():
    """Context manager that swaps the module-level ``open`` for a tolerant one.

    Inside the ``with`` block, ``open`` returns a ``FakeFile`` instead of
    raising IOError. The previous ``open`` is restored on exit, even when
    the block raises.
    """
    global open
    old_open = open

    def new_fake_open(filename, mode="r"):
        try:
            fh = old_open(filename, mode)
        except IOError:
            fh = FakeFile()
        return fh

    open = new_fake_open
    try:
        yield
    finally:
        open = old_open
################################################################################
with monkey_patched_open():
for line in open("NOSUCHFILE"):
print "NOSUCHFILE->", line
print "Other", open("MISSING").read()
print "OK", open(__file__).read()[:30]
Running the above gives:
NOSUCHFILE-> <empty#3>
Other <empty#1>
OK #!/usr/bin/env python
from co
I left in the "empty" strings just to show what was happening.
StringIO would have sufficed just to read it once but I thought the OP was looking to keep reading from file, hence the need for FakeFile - unless someone knows of a better mechanism.
I know some see monkey patching as the act of a scoundrel.
You could try the following, but it's probably not the best:
import os
def chk_file(filename):
    """Return "" for a zero-byte file, otherwise the list of its lines."""
    size_in_bytes = os.stat(filename).st_size
    if size_in_bytes == 0:
        return ""
    with open(filename) as handle:
        lines = handle.readlines()
    return lines
if __name__=="__main__":
print chk_file("foobar.txt") #populated file
print chk_file("bar.txt") #empty file
print chk_file("spock.txt") #populated
It works. You can wrap it with your try-except, if you want.
You could define a function to catch errors:
from itertools import chain
def readlines(filename):
    """Return the lines of ``filename`` as a list, or [] when opening or
    reading it raises EnvironmentError."""
    collected = []
    try:
        with open(filename) as source:
            collected = source.readlines()  # or just `source` to return an iterator
    except EnvironmentError:
        # Best effort by design: an unreadable file contributes no lines.
        pass
    return collected
files = (readlines(name) for name in ["/var/log/1", "/var/log/2"])
lines = list(chain.from_iterable(files))

writing file & displaying result in idle

I am trying to process a text file and would like to see the output in IDLE as well as have it redirected to a text file. Due to the text formatting, I need to keep the end = '' argument in the print function. I am looking for a way to use "end = ''" and "file=output_file" simultaneously with the print function.
import re
input_file = open ('E:\input.txt','r')
output_file = open ('E:\output.txt','w')
for line in input_file:
if re.match('ab|cd', line):
print ('line with ab or cd: ', end = '',file=output_file )
print (line, end = '',file=output_file)
print (' --------', file=output_file)
print (' --------',file=output_file)
print ('!',file=output_file)
else:
print (line,file=output_file)
To write to several places using a single print(), you could define a file-like object:
class TeeFile(object):
    """File-like object that forwards every write and flush to several files."""

    def __init__(self, *files):
        # Tuple of the underlying file objects, in the order given.
        self.files = files

    def write(self, data):
        """Write ``data`` to each underlying file, in order."""
        for target in self.files:
            target.write(data)

    def flush(self):
        """Flush each underlying file, in order."""
        for target in self.files:
            target.flush()
Example:
import sys
file = TeeFile(sys.stderr, output_file)
print("abc", end="", file=file)
file.flush()

Categories

Resources