I am making a program that uses words with Swedish characters and stores them in a list. I can print Swedish characters before I put them into a list, but after they are put in, they do not appear normally, just a big mess of characters.
Here is my code:
# coding=UTF-8
def get_word(lines, eng=0):
    """Pick a word out of a line; eng=1 selects the English side.

    NOTE(review): as quoted this function looks truncated — it assigns
    word_start but never returns it, and nothing else calls it.
    """
    if eng == 1:  # caller asked for the English word
        word_start = lines[1]
def do_format(word, lang):
    """Format a "swedish - english" pair for `word` given its language.

    lang == "sv": word is Swedish, translate to get the English half.
    lang == "en": word is English, translate to get the Swedish half.
    Any other lang raises NameError (no branch assigns the pair).
    """
    if lang == "sv":
        pair = (word, translate(word, lang))
    elif lang == "en":
        pair = (translate(word, lang), word)
    return pair[0] + " - " + pair[1]
def translate(word, lang):
    """Stub translator: return a placeholder for the opposite language.

    "sv" maps to "ENGLISH", "en" to "SWEDISH"; anything else yields None.
    """
    return {"sv": "ENGLISH", "en": "SWEDISH"}.get(lang)
# Main script (Python 2; indentation was lost in the paste).
translated = []  # accumulates the formatted "swedish - english" lines
path = "C:\Users\LK\Desktop\Dropbox\Dokumentai\School\Swedish\V47.txt"  # NOTE(review): use a raw string r"..." — "\U" is a unicode-escape error in Python 3
doc = open(path, 'r') #opens the document; NOTE(review): never closed — prefer a with-block
doc_list = [] #the variable that will contain the list of words
for lines in doc.readlines(): #repeat as many times as there are lines
if len(lines) > 1: #ignore empty lines (a bare "\n" has length 1)
lines = lines.rstrip() #drop the trailing "\n"
doc_list.append(lines) #add to the list
for i in doc_list:
print i  # Python 2 print statement; prints each raw line
for i in doc_list:
if "-" in i:  # line already contains a translation pair
if i[0] == "-":  # leading "-": the rest of the line is the English word
element = do_format(i[2:], "en")
translated.append(element)
else:
translated.append(i)  # already formatted; keep as-is
else:
element = do_format(i, "sv")  # bare Swedish word: build the pair
translated.append(element)
print translated  # printing the LIST uses repr() of each element — this is why non-ASCII shows as escapes
raw_input()  # Python 2: pause so the console window stays open
I can simplify the problem to a simple code as:
# -*- coding: utf-8 -*-
test_string = "ö"  # a Python 2 byte string: raw UTF-8 bytes, not unicode
test_list = ["å"]  # printing a list repr()s its elements, hence the escapes in the output below
print test_string, test_list  # Python 2 print statement
If I run that, I get this
ö ['\xc3\xa5']
There are multiple things to notice:
The broken character. This seems to happen because your python seems to output UTF-8 but your terminal seems to be configured to some ISO-8859-X mode (hence the two characters). I'd try to use proper unicode strings in Python 2! (always u"ö" instead of "ö"). And check your locale settings (locale command when on linux)
The weird string in the list. In Python print e will print out str(e). For lists (such as ["å"]) the implementation of __str__ is the same as __repr__. And since repr(some_list) will call repr on any of the elements contained in the list, you end up with the string you see.
Example for repr(string):
>>> print u"ö"
ö
>>> print repr(u"ö")
u'\xf6'
>>> print repr("ö")
'\xc3\xb6'
If you print a list, it is printed as a structure (using the repr of its elements). You should convert it to a string first, for example by using the join() string method. With your test code it may look like:
print test_string, test_list  # the list goes through repr(), escaping non-ASCII characters
print('%s, %s, %s' % (test_string, test_list[0], ','.join(test_list)))  # formatting each STRING avoids the list repr
And output:
ö ['\xc3\xa5']
ö, å, å
I think in your main program you can:
print('%s' % (', '.join(translated)))
You can use codecs module to specify encoding of the read bytes.
import codecs
doc = codecs.open(path, 'r', encoding='utf-8') #opens the document
Files opened with codecs.open will give you unicode string after decoding the raw bytes with specified encoding.
In your code, prefix your string literals with u to make them unicode strings.
# -*- coding: utf-8 -*-
test_string = u"ö"  # u"" literal: a unicode string, so it prints correctly
test_list = [u"å"]
print test_string, test_list[0]  # index the element so print uses str(), not the list's repr()
Related
I'm trying to write Python code to detect the first non-whitespace character in each line in a file and check if it is a "}". For example, if the contents of my file are...
a
fox {
}
{ jumped
} up
... I'd want to detect the "}" in the third line despite it having two whitespaces, and in the fifth line.
I've tried doing something like this but I'm stuck:
# Asker's attempt (quoted as-is; the defects below are the question).
full_file = open ("filename", "r")
each_line = full_file.readlines()
for item in each_line
# NOTE(review): missing trailing colon on the for statement — SyntaxError as written
if item[0].find('}') != -1:
# NOTE(review): item[0] is only the FIRST character, so this misses lines
# with leading whitespace — strip the line before comparing
# do something, such as print (item)
full_file.close()
Help much appreciated!
You may try calling strip() on each line, then checking just the first character:
# Strip each line, then compare its first character.
full_file = open("filename", "r")
each_line = full_file.readlines()
for item in each_line:  # fixed: the for statement was missing its colon
    stripped = item.strip()  # hoisted: avoid calling strip() twice per line
    if stripped and stripped[0] == '}':
        print(item)
full_file.close()
You could use this function to get the first non whitespace character.
This only reads one line at a time, so it could save you some trouble if you're dealing with a very large file.
def get_first_non_whitespace_char(file_name):
    """Return the first character in file_name that is not a space.

    Only the space character ' ' is skipped: a tab or a newline is
    returned as-is. Returns None for an empty file. Reads one line at
    a time, so large files are handled cheaply.
    """
    with open(file_name, 'r') as handle:
        for row in handle:
            for ch in row:
                if ch != " ":
                    return ch
    return None
Trying out with a file like this:
sample.txt
}hello
Using the function
# Demo driver for the function above (expects sample.txt in the cwd).
file_name = "sample.txt"
first_nsp_char = get_first_non_whitespace_char(file_name)
if first_nsp_char != None:  # NOTE(review): prefer `is not None`
print("First Non space character in file:", first_nsp_char)
print("Character is '}' :", first_nsp_char == '}')
else:
print("No non-whitespace characters found in file:", file_name)
Output:
First Non space character in file: }
Character is '}' : True
my version, on slightly different input (sample.txt has two empty lines at the end, '\n'):
a
fox {
}
{ jumped
} up
is:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 10 11:28:29 2021
#author: Pietro
https://stackoverflow.com/questions/67733277/how-can-i-detect-the-first-non-whitespace-character-in-each-line-of-a-file-opene/67733429#67733429
"""
def get_first_non_whitespace_char(file_name):
    """Map each 1-based line number of file_name to that line's first
    character that is neither a space nor a newline.

    Lines containing only spaces and/or a newline are left out of the
    returned dict.
    """
    found = {}
    with open(file_name, 'r') as handle:
        for lineno, row in enumerate(handle, start=1):
            for ch in row:
                if ch not in (" ", "\n"):
                    found[lineno] = ch
                    break
    return found
# Demo driver: print line number and first significant character per line.
file_name = "sample.txt"
first_nsp_char = get_first_non_whitespace_char(file_name)
for i in first_nsp_char:  # iterates the dict's keys (line numbers)
print('line # ',i,' first character is : ',first_nsp_char[i])
output:
line # 1 first character is : a
line # 2 first character is : f
line # 3 first character is : }
line # 4 first character is : {
line # 7 first character is : }
I have a python line that has
/c/hossam/fawzy/
this sentence
I want to replace all the forward slashes wherever they are found in a line, and I would really like to use the replace method. Here is what I've done — I've reached the point where I print "Yeah Found" whenever I see a slash in the for loop, but I can't figure out how to replace it.
import sys
# Asker's attempt (quoted as-is).
file = open(r"E:\AutomationTestPath\t2.py", 'r+')  # NOTE(review): never closed; shadows the builtin `file`
contents = file.read()
ListContents = list(contents)  # NOTE(review): splitting into single characters is unnecessary —
# contents.replace('/', '-') on the whole string would do the job in one call
print(ListContents)
SearchingFor = '/'
for letter in ListContents:
if SearchingFor in letter:
print("Yeah Found")  # only detects; nothing here performs the replacement
Try:
import re
sentence = "/c/hossam/fawzy/"
sentence = re.sub(r"/", r"-", sentence) # Replace '/' with '-'; NOTE(review): for a literal character, sentence.replace('/', '-') works without a regex
print(sentence)
Output:
-c-hossam-fawzy-
import re
file = open(r"E:\AutomationTestPath\t2.py", 'r+') #Opening The File; NOTE(review): never closed — prefer a with-block
contents = file.read()
Sentence = str(contents) #NOTE(review): contents is already a str, so this conversion is redundant
sentence = re.sub(r"/", r"-", Sentence)  # replaced text is only printed, never written back to the file
print(sentence)
I'm making an an algorithm to classify words with the number of times they appear in a text given by a file.
There is my method:
def printToFile(self, fileName):
    """Write this object's str() representation to fileName, overwriting
    any existing content; the file is closed on exit."""
    with open(fileName, 'w') as out:
        out.write(str(self))
and there is the str:
# Render the word-count table, one "word: count" line, sorted by word.
def __str__(self):
cadena = ""
self.processedWords = collections.OrderedDict(sorted(self.processedWords.items()))  # NOTE(review): sorting mutates self as a side effect of printing
for key in self.processedWords:
cadena += str(key) + ": " + str(self.processedWords[key]) + "\n"
return cadena.decode('string_escape')  # NOTE(review): 'string_escape' is a Python 2-only codec (removed in Python 3)
When I print the data through console there is no issues, nevertheless, when I do through file appears random characters.
This should be the output to the file
This is the output given
This looks like a encoding issue, try opening the file like this:
open("file name","w",encoding="utf8")
Utf8 is the most popular encoding but it might not be the real encoding, you might have to check out other encodings such as utf16
I'm trying to replace special patterns in string by tabs. This string ( if i may call it) is a result from reading a file, that has accents (I'm portuguese, so UTF-8 or LATIN-1 is the encoding language).
So imagine my input is:
Aubrecht, Christoph; Özceylan, Aubrecht Dilek; Klerx, Joachim; Freire, Sérgio (2013) “Future-oriented activities as a concept for improved disaster risk management. Disaster Advances”, 6(12), 1-10. (IF = 2.272) E-ISSN 2278-4543. REVISTA INDEXADA NO WEB OF SCIENCE
Aubrecht, Christoph; Özceylan, Dilek; Steinnocher, Klaus; Freire, Sérgio (2013), “Multi-level geospatial modeling of human exposure patterns and vulnerability indicators”. Natural Hazards, 68:147-163. (IF = 1.639).. ISSN: 0921-030X (print version). ISSN: 1573-0840 (electronic version. Accession Number: WOS:000322724000008
Some of those special patterns are :
') "' --> '\t'
'), "' --> '\t'
'),"' --> '\t'
') "' --> '\t'
'),«' --> '\t'
'), «' --> '\t'
') "' --> '\t'
Until now I've tried using a dictionary to replace all those characters, but it happens that the dictionary doesn't recognize some of those patterns. I know the re.sub function is the tool for this (python replace space with special characters between strings), but that works when you have a predefined string — when you read from a file, how do you do it?
My code:
# -*- coding: utf-8 -*-
import Tkinter as tk
import codecs, string, sys, re
root = tk.Tk()
root.title("Final?")
f = open('INPUT TEXT', 'r')  # NOTE(review): leaked handle — immediately shadowed by the codecs.open below
with codecs.open('INPUT TEXT', encoding='latin1') as f:
sentence = f.read()  # unicode text after latin-1 decoding
if isinstance(sentence, unicode):
sentence = sentence.encode('latin1')  # NOTE(review): re-encoding to bytes undoes the decode — keep it unicode
# Print the raw input text (Python 2 print statement).
def results1():
print '\n', sentence
print results1, '\n'  # NOTE(review): prints the function OBJECT; call results1() to run it
key = {0:') "', 1:'replace'}
# NOTE(review): .format(key) substitutes key[0] = ') "' into the pattern, and the
# unescaped ")" raises re.error: unbalanced parenthesis (the traceback quoted below).
# Escape literal text with re.escape() before compiling it.
regx = re.compile('\t\t{[0]}\t\t'.format(key))
print( regx.sub(key[1],results1) )  # NOTE(review): sub() needs a string, not a function object
def replace_all(text, dic):
    """Return text with every key of dic replaced by its value.

    Replacements are applied sequentially, so dictionary order (and
    earlier substitutions) can influence later ones.
    """
    for old, new in dic.items():  # fixed: .items() works on Python 2 and 3; .iteritems() is Python 2-only
        text = text.replace(old, new)
    return text
# NOTE(review): ') "' appears three times as a key — duplicate dict keys collapse, so only one survives.
reps = {' (':'\t', ') "':'\t', '), "':'\t', '),"':'\t', ') "':'\t', '),«':'\t', '), «':'\t', ') "':'\t', 'p.':'\t', ',':' '}
converts = replace_all(sentence, reps)  # NOTE(review): this variable is immediately shadowed by the function below
def converts():
# NOTE(review): redirects stdout permanently; restore sys.stdout afterwards, or write to the file handle directly
sys.stdout = open('output.txt', 'w')
converts = replace_all(sentence, reps)
print '\n', converts
results = tk.Button(root, text='Resultados', width=25, command=resultadosnormais)  # NOTE(review): resultadosnormais is not defined anywhere — NameError at this line
results.pack()
txt = tk.Button(root, text='Conversor resultados', width=25, command=conversortexto)  # NOTE(review): conversortexto is also undefined
txt.pack()
root.mainloop()
I saw this post too but I can't seem to apply it on my code in specific :Re.sub not working for me
But somehow it stores the function somewhere, and then it gives an error right after that:
File "C:\Users\Joao\Desktop\Tryout2.py", line 30, in <module>
regx = re.compile('\t\t{[0]}\t\t'.format(key))
error: unbalanced parenthesis
so I'm having this trouble with the decode. I found it in other threads how to do it for simple strings, with the u'string'.encode. But I can't find a way to make it work with files.
Any help would be appreciated!
Here's the code.
text = file.read()
text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))  # BUG: str.replace returns a NEW string; the result is discarded here
file.seek(0) # rewind
file.write(text.encode('utf-8'))  # so this writes the original, unmodified text back
and here's the whole code, should it help.
#!/usr/bin/env python
# coding: utf-8
"""
Script to helps on translate some code's methods from
portuguese to english.
"""
from multiprocessing import Pool
from mock import MagicMock
from goslate import Goslate
import fnmatch
import logging
import os
import re
import urllib2
_MAX_PEERS = 1  # worker count; 1 selects the sequential (non-pool) path below
try:
os.remove('traducoes.log')  # start each run with a fresh log file
except OSError:
pass  # file didn't exist yet — nothing to remove
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('traducoes.log')
logger.addHandler(handler)
def fileWalker(ext, dirname, names):
    """os.path.walk visitor: collect files matching an extension.

    ext is a two-item list [extension, accumulator]; every name in
    `names` matching "*<extension>" is joined with dirname and appended
    to the accumulator ext[1].
    """
    pattern = "*" + ext[0]
    matches = ext[1]
    for name in names:
        if fnmatch.fnmatch(name, pattern):
            matches.append(os.path.join(dirname, name))
def encontre_text(file):
"""
Find the words in the file's text which contain an underscore ('_').
"""
text = file.read().decode('utf-8')  # Python 2: decode the raw bytes to unicode
return re.findall(r"\w+(?<=_)\w+", text)  # word characters with a '_' in the middle
#return re.findall(r"\"\w+\"", text)
def traduza_palavra(txt):
"""
Translate the word/phrase to English (via Google, through goslate).
"""
try:
# probe network connectivity via Google before attempting translation
response = urllib2.urlopen('http://google.com', timeout=2)  # NOTE(review): response is never used
pass
except urllib2.URLError as err:
print "No network connection "
exit(-1)  # NOTE(review): exiting the whole process from a helper is heavy-handed
if txt[0] != '_':
txt = txt.replace('_', ' ')  # snake_case -> words for the translator
txt = txt.replace('media'.decode('utf-8'), 'média'.decode('utf-8'))  # Python 2: .decode on a str
gs = Goslate()
#txt = gs.translate(txt, 'en', gs.detect(txt))
txt = gs.translate(txt, 'en', 'pt-br') # force Brazilian Portuguese as the source language
txt = txt.replace(' en ', ' br ')
return txt.replace(' ', '_') # back to snake_case; # .lower()
def subistitua(file, txt, novo_txt):
"""
Rewrite the open file, replacing txt with novo_txt.
"""
text = file.read()
text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))  # BUG: str.replace returns a new string; the result is discarded, so nothing changes
file.seek(0) # rewind
file.write(text.encode('utf-8'))  # writes the original text back unchanged
def magica(File):
"""
Worker: translate every underscore-joined identifier found in one file.
One pool thread/process handles one element of the file list.
"""
global _DONE  # NOTE(review): _DONE is never defined at module level
if _MAX_PEERS == 1: # not feasible in multithread (original comment: "inviavel em multithread")
logger.info('\n---- File %s' % File)
with open(File, "r+") as file:
list_txt = encontre_text(file)
for txt in list_txt:
novo_txt = traduza_palavra(txt)
if txt != novo_txt:
logger.info('%s -> %s [%s]' % (txt, novo_txt, File))
subistitua(file, txt, novo_txt)
file.close()  # NOTE(review): redundant — the with-block already closes the file
print File.ljust(70) + '[OK]'.rjust(5)
if __name__ == '__main__':
try:
response = urllib2.urlopen('http://www.google.com.br', timeout=1)  # connectivity probe; response unused
except urllib2.URLError as err:
print "No network connection "
exit(-1)
root = './app'
ex = ".py"
files = []
os.path.walk(root, fileWalker, [ex, files])  # Python 2-only API (removed in Py3; use os.walk there)
print '%d files found to be translated' % len(files)
try:
if _MAX_PEERS > 1:
_pool = Pool(processes=_MAX_PEERS)
result = _pool.map_async(magica, files)
result.wait()
else:
result = MagicMock()  # mock stand-in so the finally-block can query successful()
result.successful.return_value = False
for f in files:
pass  # NOTE(review): indentation looks mangled in the paste — magica(f) below presumably belongs inside this loop
magica(f)
result.successful.return_value = True
except AssertionError, e:  # Python 2-only except syntax
print e
else:
pass
finally:
if result.successful():  # NOTE(review): NameError if the try-block failed before result was assigned
print 'Translated all files'
else:
print 'Some files were not translated'
Thank you all for the help!
In Python 2, reading from files produces regular (byte) string objects, not unicode objects. There is no need to call .encode() on these; in fact, that'll only trigger an automatic decode to Unicode first, which can fail.
Rule of thumb: use a unicode sandwich. Whenever you read data, you decode to unicode at that stage. Use unicode values throughout your code. Whenever you write data, encode at that point. You can use io.open() to open file objects that encode and decode automatically for you.
That also means you can use unicode literals everywhere; for your regular expressions, for your string literals. So use:
# Answer's fixed version: the file object is expected to decode for us.
def encontre_text(file):
text = file.read() # assume `io.open()` was used, so text is already unicode
return re.findall(ur"\w+(?<=_)\w+", text) # use a unicode pattern; NOTE(review): ur"" literals are Python 2-only syntax
and
def subistitua(file, txt, novo_txt):
    """Replace every occurrence of txt with novo_txt in the already-open
    file (opened in a read/write mode, e.g. via io.open), rewriting
    from the start of the file.

    Note: the file is not truncated, so a replacement shorter than the
    original text leaves a tail of old content behind.
    """
    updated = file.read().replace(txt, novo_txt)
    file.seek(0)  # rewind before rewriting
    file.write(updated)
as all string values in the program are already unicode, and
txt = txt.replace(u'media', u'média')
as u'..' unicode string literals don't need decoding anymore.