Encoding foreign alphabet characters

Encoding foreign alphabet characters - python

I am getting data from an XML provided by an API that for some reason lists Czechslovak characters in a different encoding (e.g. instead of correct Czechoslovak "ý" it uses "Ã½"). Therefore, instead of providing the
correct output to the user -> "Zelený"
the output is -> "ZelenÃ½"
I went through multiple StackOverflow posts, other fora and tutorials, but I still cannot figure out how to make it turn "ZelenÃ½" into "Zelený" (this is just one of the weird characters used by the XML so I cannot use str.replace).
I figured out, that the correct encoding for the Czech/Slovak language is "windows-1250"
My code:
def change_encoding(what):
what = what.encode("windows-1250")
return what
clean_xml_input = change_encoding(xml_input)
This produces error:
'charmap' codec can't encode characters in position 5-6: character
maps to <undefined>

"ZelenÃ½".encode("Windows-1252").decode("utf-8") #'Zelený'
"ZelenÃ½".encode("windows-1254").decode("utf-8") #'Zelený'
"ZelenÃ½".encode("iso-8859-1").decode("utf-8") #'Zelený'
"ZelenÃ½".encode("iso-8859-9").decode("utf-8") #'Zelený'
If it is helpful
from itertools import permutations
all_encoding = ['ASMO-708',
'big5',
'cp1025',
'cp866',
'cp875',
'csISO2022JP',
'DOS-720',
'DOS-862',
'EUC-CN',
'EUC-JP',
'euc-jp',
'euc-kr',
'GB18030',
'gb2312',
'hz-gb-2312',
'IBM00858',
'IBM00924',
'IBM01047',
'IBM01140',
'IBM01141',
'IBM01142',
'IBM01143',
'IBM01144',
'IBM01145',
'IBM01146',
'IBM01147',
'IBM01148',
'IBM01149',
'IBM037',
'IBM1026',
'IBM273',
'IBM277',
'IBM278',
'IBM280',
'IBM284',
'IBM285',
'IBM290',
'IBM297',
'IBM420',
'IBM423',
'IBM424',
'IBM437',
'IBM500',
'ibm737',
'ibm775',
'ibm850',
'ibm852',
'IBM855',
'ibm857',
'IBM860',
'ibm861',
'IBM863',
'IBM864',
'IBM865',
'ibm869',
'IBM870',
'IBM871',
'IBM880',
'IBM905',
'IBM-Thai',
'iso-2022-jp',
'iso-2022-jp',
'iso-2022-kr',
'iso-8859-1',
'iso-8859-13',
'iso-8859-15',
'iso-8859-2',
'iso-8859-3',
'iso-8859-4',
'iso-8859-5',
'iso-8859-6',
'iso-8859-7',
'iso-8859-8',
'iso-8859-8-i',
'iso-8859-9',
'Johab',
'koi8-r',
'koi8-u',
'ks_c_5601-1987',
'macintosh',
'shift_jis',
'us-ascii',
'utf-16',
'utf-16BE',
'utf-32',
'utf-32BE',
'utf-7',
'utf-8',
'windows-1250',
'windows-1251',
'Windows-1252',
'windows-1253',
'windows-1254',
'windows-1255',
'windows-1256',
'windows-1257',
'windows-1258',
'windows-874',
'x-Chinese-CNS',
'x-Chinese-Eten',
'x-cp20001',
'x-cp20003',
'x-cp20004',
'x-cp20005',
'x-cp20261',
'x-cp20269',
'x-cp20936',
'x-cp20949',
'x-cp50227',
'x-EBCDIC-KoreanExtended',
'x-Europa',
'x-IA5',
'x-IA5-German',
'x-IA5-Norwegian',
'x-IA5-Swedish',
'x-iscii-as',
'x-iscii-be',
'x-iscii-de',
'x-iscii-gu',
'x-iscii-ka',
'x-iscii-ma',
'x-iscii-or',
'x-iscii-pa',
'x-iscii-ta',
'x-iscii-te',
'x-mac-arabic',
'x-mac-ce',
'x-mac-chinesesimp',
'x-mac-chinesetrad',
'x-mac-croatian',
'x-mac-cyrillic',
'x-mac-greek',
'x-mac-hebrew',
'x-mac-icelandic',
'x-mac-japanese',
'x-mac-korean',
'x-mac-romanian',
'x-mac-thai',
'x-mac-turkish',
'x-mac-ukrainian']
for i,j in permutations(all_encoding, 2):
try:
if("ZelenÃ½".encode(i).decode(j) == 'Zelený'):
print(f'encode with `{i}` and decode with `{j}`')
except:
pass

Related

'gbk' codec can't encode character '\u2022' in position 32: illegal multibyte sequence

There is a question about writting file.
when I used data.to_csv('/home/bio_kang/Learning/Python/film_project/top250_film_info.csv', index=None, encoding='gbk'), it given me a error that 'gbk' codec can't encode character '\u2022' in position 32: illegal multibyte sequence.
The data come from a website https://movie.douban.com/top250. I use requests , beautifulsoup and re to get them form the website.
And, here is my part code:
uni_num = []
years = []
countries = []
directors = []
actors = []
descriptions = []
for i in range(250):
with open('/home/bio_kang/Learning/Python/film_project/film_info/film_{}.html'.format(i), 'rb') as f:
film_info = f.read().decode('utf-8','ignore')
pattern_uni_num = re.compile(r'<span class="pl">IMDb:</span> (.*?)<br/>') # unique number
pattern_year = re.compile(r'<span class="year">\((.*?)\)</span>') # year
pattern_country = re.compile(r'<span class="pl">制片国家/地区:</span>(.*?)<br/>') # country
pattern_director = re.compile(r'<meta content=(.*?) property="video:director"/>') # director
pattern_actor = re.compile(r'<meta content="(.*?)" property="video:actor"/>') # actors
pattern_description = re.compile(r'<meta content="(.*?)property="og:description">') # description
uni_num.append(str(re.findall(pattern_uni_num, film_info)).strip("[]").strip("'"))
years.append(str(re.findall(pattern_year, film_info)).strip("[]").strip("'"))
countries.append(str(re.findall(pattern_country, film_info)).strip("[]").strip("'").split('/')[0])
directors.append(re.findall(pattern_director, film_info))
actors.append(re.findall(pattern_actor, film_info))
descriptions.append(str(re.findall(pattern_description, film_info)).strip('[]').strip('\''))
raw_data = {'encoding':uni_num, 'name':names, 'description':descriptions, 'country':countries, 'director':new_director, 'actor':new_actor, 'vote':new_votes, 'score':scores, 'year':years, 'link':urls }
data = pd.DataFrame(raw_data)
data.to_csv('/home/bio_kang/Learning/Python/film_project/top250_film_info.csv', index=None, encoding='gbk')

try that:
open('...','rb',encoding='utf-8')
or utf-16

Python and string accents

I am making a web scraper.
I access google search, I get the link of the web page and then I get the contents of the <title> tag.
The problem is that, for example, the string "P\xe1gina N\xe3o Encontrada!" should be "Página Não Encontrada!".
I tried do decode to latin-1 and then encode to utf-8 and it did not work.
r2 = requests.get(item_str)
texto_pagina = r2.text
soup_item = BeautifulSoup(texto_pagina,"html.parser")
empresa = soup_item.find_all("title")
print(empresa_str.decode('latin1').encode('utf8'))
Can you help me, please?
Thanks !

You can change the retrieved text variable to something like:
string = u'P\xe1gina N\xe3o Encontrada!'.encode('utf-8')
After printing string it seemed to work just fine for me.
Edit
Instead of adding .encode('utf8'), have you tried just using empresa_str.decode('latin1')?
As in:
string = empresa_str.decode('latin_1')

Not the most elegant solution, but worked for me :
def remove_all(substr, str):
index = 0
length = len(substr)
while string.find(str, substr) != -1:
index = string.find(str, substr)
str = str[0:index] + str[index+length:]
return str
def latin1_to_ascii (unicrap):
xlate={ 'xc3cb3':'o' , 'xc3xa7':'c','xc3xb5':'o', 'xc3xa3':'a', 'xc3xa9':'e',
'xc0':'A', 'xc1':'A', 'xc2':'A', 'xc3':'A', 'xc4':'A', 'xc5':'A',
'xc6':'Ae', 'xc7':'C',
'xc8':'E', 'xc9':'E', 'xca':'E', 'xcb':'E',
'xcc':'I', 'xcd':'I', 'xce':'I', 'xcf':'I',
'xd0':'Th', 'xd1':'N',
'xd2':'O', 'xd3':'O', 'xd4':'O', 'xd5':'O', 'xd6':'O', 'xd8':'O',
'xd9':'U', 'xda':'U', 'xdb':'U', 'xdc':'U',
'xdd':'Y', 'xde':'th', 'xdf':'ss',
'xe0':'a', 'xe1':'a', 'xe2':'a', 'xe3':'a', 'xe4':'a', 'xe5':'a',
'xe6':'ae', 'xe7':'c',
'xe8':'e', 'xe9':'e', 'xea':'e', 'xeb':'e',
'xec':'i', 'xed':'i', 'xee':'i', 'xef':'i',
'xf0':'th', 'xf1':'n',
'xf2':'o', 'xf3':'o', 'xf4':'o', 'xf5':'o', 'xf6':'o', 'xf8':'o',
'xf9':'u', 'xfa':'u', 'xfb':'u', 'xfc':'u',
'xfd':'y', 'xfe':'th', 'xff':'y',
'xa1':'!', 'xa2':'{cent}', 'xa3':'{pound}', 'xa4':'{currency}',
'xa5':'{yen}', 'xa6':'|', 'xa7':'{section}', 'xa8':'{umlaut}',
'xa9':'{C}', 'xaa':'{^a}', 'xab':'<<', 'xac':'{not}',
'xad':'-', 'xae':'{R}', 'xaf':'_', 'xb0':'{degrees}',
'xb1':'{+/-}', 'xb2':'{^2}', 'xb3':'{^3}', 'xb4':'',
'xb5':'{micro}', 'xb6':'{paragraph}', 'xb7':'*', 'xb8':'{cedilla}',
'xb9':'{^1}', 'xba':'{^o}', 'xbb':'>>',
'xbc':'{1/4}', 'xbd':'{1/2}', 'xbe':'{3/4}', 'xbf':'?',
'xd7':'*', 'xf7':'/'
}
unicrap = remove_all ('\\', unicrap)
unicrap = remove_all('&', unicrap)
unicrap = remove_all('u2013', unicrap)
r = unicrap
for item,valor in xlate.items():
#print item, unicrap.find(item)
r = r.replace(item,valor)
return r

python 3.4 UTF 8 string on list element

I will insert this codes to my program but it gives me syntax unicode error because of the Turkish letters. How can I solve this? I cant find any solution on python 3.4
bilissel_olay = [
"Dikkatini çeken nesne/durum/olayı ayrıntılarıyla açıklayabiliyor ve sorular sorabiliyor.",
"Dikkatini çeken nesne/durum/olaya yönelik sorular sorabiliyor.",
"Dikkatini çeken nesne/durum/olaya yönelik sorular sormada ve açıklamada desteğe ihtiyacı var.",
"Dikkatini çeken nesne/durum/olaya yönelik sorular sormada ve açıklamada başarısız."]
bilissel_sayi = [
"İleriye doğru birer birer ritmik sayabiliyor. Saydığı nesnelerin kaç tane olduğunu söyleyebiliyor ve rakamları tanıyabiliyor.",
"Saydığı nesnelerin kaç tane olduğunu söyleyebiliyor ve rakamları tanıyabiliyor.",
"Rakamları tanımakta ve saydığı nesnelerin kaç tane olduğunu söylemekte zorlanıyor."]
bilissel_siniflandirma = [
"Nesne veya varlıkları çeşitli özelliklerine göre gözlemleyebiliyor, eşleştirebiliyor ve gruplayabiliyor.",
"Nesne/varlıkları yapıldığı malzemeye göre gruplayabiliyor. Nesne/varlıkları büyüklüklerine göre sıralayabiliyor.",
"Nesne veya varlıkların tanınmasında desteklenmeli.",
"Nesne veya varlıkları tanımakta zorluk çekiyor."]
print (bilissel_sayi[0])
I get the exception:
SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0xe7 in position 0: unexpected end of data – Melih Muhammet Gundogdu 10 hours ago

If you're on python3, the default encoding should be utf-8, so you should not need to specify it explictly.
If you're on python2, place the following in the header of your code:
# -*- coding: utf-8 -*-
See here for reference.

Python encoding/decoding problems

How do I decode strings such as this one "weren\xe2\x80\x99t" back to the normal encoding.
So this word is actually weren't and not "weren\xe2\x80\x99t"?
For example:
print "\xe2\x80\x9cThings"
string = "\xe2\x80\x9cThings"
print string.decode('utf-8')
print string.encode('ascii', 'ignore')
â€œThings
“Things
Things
But I actually want to get "Things.
or:
print "weren\xe2\x80\x99t"
string = "weren\xe2\x80\x99t"
print string.decode('utf-8')
print string.encode('ascii', 'ignore')
werenâ€™t
weren’t
werent
But I actually want to get weren't.
How should i do this?

I mapped the most common strange chars so this is pretty much complete answer based on the Oliver W. answer.
This function is by no means ideal,but it is the best place to start with.
There are more chars definitions:
http://utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string
http://www.utf8-chartable.de/unicode-utf8-table.pl?start=128&number=128&names=-&utf8=string-literal
...
def unicodetoascii(text):
uni2ascii = {
ord('\xe2\x80\x99'.decode('utf-8')): ord("'"),
ord('\xe2\x80\x9c'.decode('utf-8')): ord('"'),
ord('\xe2\x80\x9d'.decode('utf-8')): ord('"'),
ord('\xe2\x80\x9e'.decode('utf-8')): ord('"'),
ord('\xe2\x80\x9f'.decode('utf-8')): ord('"'),
ord('\xc3\xa9'.decode('utf-8')): ord('e'),
ord('\xe2\x80\x9c'.decode('utf-8')): ord('"'),
ord('\xe2\x80\x93'.decode('utf-8')): ord('-'),
ord('\xe2\x80\x92'.decode('utf-8')): ord('-'),
ord('\xe2\x80\x94'.decode('utf-8')): ord('-'),
ord('\xe2\x80\x94'.decode('utf-8')): ord('-'),
ord('\xe2\x80\x98'.decode('utf-8')): ord("'"),
ord('\xe2\x80\x9b'.decode('utf-8')): ord("'"),
ord('\xe2\x80\x90'.decode('utf-8')): ord('-'),
ord('\xe2\x80\x91'.decode('utf-8')): ord('-'),
ord('\xe2\x80\xb2'.decode('utf-8')): ord("'"),
ord('\xe2\x80\xb3'.decode('utf-8')): ord("'"),
ord('\xe2\x80\xb4'.decode('utf-8')): ord("'"),
ord('\xe2\x80\xb5'.decode('utf-8')): ord("'"),
ord('\xe2\x80\xb6'.decode('utf-8')): ord("'"),
ord('\xe2\x80\xb7'.decode('utf-8')): ord("'"),
ord('\xe2\x81\xba'.decode('utf-8')): ord("+"),
ord('\xe2\x81\xbb'.decode('utf-8')): ord("-"),
ord('\xe2\x81\xbc'.decode('utf-8')): ord("="),
ord('\xe2\x81\xbd'.decode('utf-8')): ord("("),
ord('\xe2\x81\xbe'.decode('utf-8')): ord(")"),
}
return text.decode('utf-8').translate(uni2ascii).encode('ascii')
print unicodetoascii("weren\xe2\x80\x99t")

In Python 3 I would do it like this:
string = "\xe2\x80\x9cThings"
bytes_string = bytes(string, encoding="raw_unicode_escape")
happy_result = bytes_string.decode("utf-8", "strict")
print(happy_result)
No translation maps needed, just code :)

You should provide a translation map that maps unicode characters to other unicode characters (the latter should be within the ASCII range if you want to re-encode to it):
uni2ascii = {ord('\xe2\x80\x99'.decode('utf-8')): ord("'")}
yourstring.decode('utf-8').translate(uni2ascii).encode('ascii')
print(yourstring) # prints: "weren't"

How I can print characters from various alphabets in python 3?

I am trying to work with cyrillic alphabet and latin alphabet with central european characters, however I am not able to print cyrillic characters. Take a look at the sample code below.
# -*- coding: utf-8 -*-
print("ň")
print("ф")
I've been able to output "ň", once I have set "encoding": "cp1250" in Python.sublime-settings, but unfortunately I have not found any means of displaying cyrillic character.
Thanks for any help.
------------------------Edit--------------------------
meanwhile I've put together code, that works in Ubuntu 13.04 environment but throwing exception in Win 7.
Exception:
Traceback (most recent call last):
File "C:\Users\branislavs\docs\personal\scripts\playground.py", line 6, in <module>
for line in data:
File "C:\Python34\lib\encodings\cp1250.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 51: character maps to <undefined>
Environment:
Win 7
Python 3.4,
Sublime Text 2,
I am trying to output it on windows console.
What should in win to make it work?
I am pasting the code as well:
import sys
char_mapping = ("абвгґдезіийклмнопрстуфъыьцчжшАБВГҐДЕЗIИЙКЛМНОПРСТУФЪЫЬЦЧЖШ",
"abvhgdeziyjklmnoprstuf'ŷ'cčžšABVHGDEZIYJKLMNOPRSTUF'Ŷ'cČŽŠ")
syllable_mapping = {
"special": { #di ti ni li, da ta na la, cja, cji, sja, sji, rja, rji
"ďi": "дї",
"Ďi": "Дї",
"ťi": "тї",
"Ťi": "Тї",
"ňi": "нї",
"Ňi": "Нї",
"ľi": "лї",
"Ľi": "Лї",
"ďa": "дя",
"Ďa": "Дя",
"ťa": "тя",
"Ťa": "Тя",
"ňa": "ня",
"Ňa": "Hя",
"ľa": "ля",
"Ľa": "Ля",
"c'a": "ця",
"c'a": "Ця",
"c'i": "цї",
"C'i": "Цї",
"c'o": "цё",
"C'o": "Цё",
"s'a": "ся",
"S'a": "Ся",
"s'i": "сї",
"S'i": "Сї",
"s'o": "сё",
"S'o": "Сё",
"r'a": "ря",
"R'a": "Ря",
"r'i": "рї",
"R'i": "Рї",
"r'o": "рё",
"R'o": "Рё",
"z'a": "зя",
"Z'a": "Зя",
"z'i": "зї",
"Z'i": "Зї",
"z'o": "зё",
"Z'o": "Зё",
},
"carons": {
"ď": "дь",
"Ď": "Дь",
"ť": "ть",
"Ť": "Ть",
"ň": "нь",
"Ň": "Нь",
"ľ": "ль",
"Ľ": "Ль",
},
"basic" : {
"ja": "я",
"Ja": "Я",
"ju": "ю",
"Ju": "Ю",
"je": "є",
"Je": "Є",
"ch": "х",
"Ch": "X",
"'o": "ё",
"'O": "Ë",
"x": "кc",
"X": "Кc",
"šč": "щ",
"Šč": "Щ",
"ji": "ї",
"c'" : "ць",
"C'" : "Ць",
"s'" : "сь",
"S'" : "Сь",
"r'" : "рь",
"R'" : "Рь",
"z'" : "зь",
"Z'" : "Зь",
}
}
tr_azb_lat = {ord(a):ord(b) for a, b in zip(*char_mapping)}
tr_lat_azb = {ord(b):ord(a) for a, b in zip(*char_mapping)}
def map_syllables_azb_lat(string, mapping_option):
for rule in syllable_mapping[mapping_option]:
string = string.replace(syllable_mapping[mapping_option][rule], rule)
return string
def translit_azb_lat(string):
string = map_syllables_azb_lat(string, 'special')
string = map_syllables_azb_lat(string, 'carons')
string = map_syllables_azb_lat(string, 'basic')
return string.translate(tr_azb_lat).encode('utf-8').decode(sys.stdout.encoding)
def map_syllables_lat_azb(string, mapping_option):
for rule in syllable_mapping[mapping_option]:
string = string.replace(rule, syllable_mapping[mapping_option][rule])
return string
def translit_lat_azb(string):
string = map_syllables_lat_azb(string, 'special')
string = map_syllables_lat_azb(string, 'carons')
string = map_syllables_lat_azb(string, 'basic')
return string.translate(tr_lat_azb).encode('utf-8').decode(sys.stdout.encoding)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Encoding foreign alphabet characters - python

Related

'gbk' codec can't encode character '\u2022' in position 32: illegal multibyte sequence

Python and string accents

python 3.4 UTF 8 string on list element

Python encoding/decoding problems

How I can print characters from various alphabets in python 3?

Categories

Resources