Related
I am getting data from an XML provided by an API that for some reason lists Czechslovak characters in a different encoding (e.g. instead of correct Czechoslovak "ý" it uses "ý"). Therefore, instead of providing the
correct output to the user -> "Zelený"
the output is -> "Zelený"
I went through multiple StackOverflow posts, other fora and tutorials, but I still cannot figure out how to make it turn "Zelený" into "Zelený" (this is just one of the weird characters used by the XML so I cannot use str.replace).
I figured out, that the correct encoding for the Czech/Slovak language is "windows-1250"
My code:
def change_encoding(what):
what = what.encode("windows-1250")
return what
clean_xml_input = change_encoding(xml_input)
This produces error:
'charmap' codec can't encode characters in position 5-6: character
maps to <undefined>
"Zelený".encode("Windows-1252").decode("utf-8") #'Zelený'
"Zelený".encode("windows-1254").decode("utf-8") #'Zelený'
"Zelený".encode("iso-8859-1").decode("utf-8") #'Zelený'
"Zelený".encode("iso-8859-9").decode("utf-8") #'Zelený'
If it is helpful
from itertools import permutations
all_encoding = ['ASMO-708',
'big5',
'cp1025',
'cp866',
'cp875',
'csISO2022JP',
'DOS-720',
'DOS-862',
'EUC-CN',
'EUC-JP',
'euc-jp',
'euc-kr',
'GB18030',
'gb2312',
'hz-gb-2312',
'IBM00858',
'IBM00924',
'IBM01047',
'IBM01140',
'IBM01141',
'IBM01142',
'IBM01143',
'IBM01144',
'IBM01145',
'IBM01146',
'IBM01147',
'IBM01148',
'IBM01149',
'IBM037',
'IBM1026',
'IBM273',
'IBM277',
'IBM278',
'IBM280',
'IBM284',
'IBM285',
'IBM290',
'IBM297',
'IBM420',
'IBM423',
'IBM424',
'IBM437',
'IBM500',
'ibm737',
'ibm775',
'ibm850',
'ibm852',
'IBM855',
'ibm857',
'IBM860',
'ibm861',
'IBM863',
'IBM864',
'IBM865',
'ibm869',
'IBM870',
'IBM871',
'IBM880',
'IBM905',
'IBM-Thai',
'iso-2022-jp',
'iso-2022-jp',
'iso-2022-kr',
'iso-8859-1',
'iso-8859-13',
'iso-8859-15',
'iso-8859-2',
'iso-8859-3',
'iso-8859-4',
'iso-8859-5',
'iso-8859-6',
'iso-8859-7',
'iso-8859-8',
'iso-8859-8-i',
'iso-8859-9',
'Johab',
'koi8-r',
'koi8-u',
'ks_c_5601-1987',
'macintosh',
'shift_jis',
'us-ascii',
'utf-16',
'utf-16BE',
'utf-32',
'utf-32BE',
'utf-7',
'utf-8',
'windows-1250',
'windows-1251',
'Windows-1252',
'windows-1253',
'windows-1254',
'windows-1255',
'windows-1256',
'windows-1257',
'windows-1258',
'windows-874',
'x-Chinese-CNS',
'x-Chinese-Eten',
'x-cp20001',
'x-cp20003',
'x-cp20004',
'x-cp20005',
'x-cp20261',
'x-cp20269',
'x-cp20936',
'x-cp20949',
'x-cp50227',
'x-EBCDIC-KoreanExtended',
'x-Europa',
'x-IA5',
'x-IA5-German',
'x-IA5-Norwegian',
'x-IA5-Swedish',
'x-iscii-as',
'x-iscii-be',
'x-iscii-de',
'x-iscii-gu',
'x-iscii-ka',
'x-iscii-ma',
'x-iscii-or',
'x-iscii-pa',
'x-iscii-ta',
'x-iscii-te',
'x-mac-arabic',
'x-mac-ce',
'x-mac-chinesesimp',
'x-mac-chinesetrad',
'x-mac-croatian',
'x-mac-cyrillic',
'x-mac-greek',
'x-mac-hebrew',
'x-mac-icelandic',
'x-mac-japanese',
'x-mac-korean',
'x-mac-romanian',
'x-mac-thai',
'x-mac-turkish',
'x-mac-ukrainian']
for i,j in permutations(all_encoding, 2):
try:
if("Zelený".encode(i).decode(j) == 'Zelený'):
print(f'encode with `{i}` and decode with `{j}`')
except:
pass
I'm trying to filter from string only the Arabic character but the next function doesn't work for me:
import re
def remove_any_non_arabic_char(text):
non_arabic_char = re.compile('^[\u0627-\u064a]')
text = re.sub(non_arabic_char, "", text)
print(text)
for example:
s = "Kühn xvii, 346] قال جالينوس: [1] قد اتفق جل من فسر هذا الكتا"
The desired output of remove_any_non_arabic_char(s) should be قال جالينوس قد اتفق جل من فسر هذا الكتا but the input stays without changes.
What should I do?
First, you need to fix your regex as suggested in the comments, then for a more efficient solution, you will need to expand your Unicode character selection to include all Arabic character mappings. Finally, you need to keep at least one space between Arabic words to keep the Arabic text legible:
import re
def remove_any_non_arabic_char(text):
non_arabic_char = re.compile('[^\s\\u0600-\u06FF]')
text_with_no_spaces = re.sub(non_arabic_char, "", text)
text_with_single_spaces = " ".join(re.split("\s+", text_with_no_spaces))
return text_with_single_spaces
text_1 = "Kühn xvii, 346] قال جالينوس: [1] قد اتفق جل من فسر هذا الكتا"
text_2 = '''
تغيّر مفهوم كلمة (أدب) من العصر الجاهلي jahili (pre-Islamic) era إلى الآن عبر
مراحل periods التاريخ المتعددة. ففي الجاهلية، كانت كلمة أدب تعني (الدعوة إلى
الطعام). وبعدها، استخدم الرسول محمد (عليه السلام) الكلمة بمعنى "التهذيب والتربية"
education and mannerism. وفي العصر الأموي، اتصلت had to do كلمة أدب
بالتاريخ والفقه والقرآن والحديث. أما في العصرالعباسي، فأصبحت تعني تعلّم الشعر
والنثر prose واتسع الأدب ليشمل أنواع المعرفة وألوانها وخصوصاً علم البلاغة واللغة.
أما في الوقت الحالي، فأصبحت كلمة أدب ذات صلة pertinent بالكلام البليغ
الجميل المؤثر that impacts في أحاسيس القاريء أو السامع.
'''
# Isleem, N. M., & Abuhakema, G. M. (2020). Kalima wa Nagham: A Textbook for
# Teaching Arabic, Volume 2 (Vol. 3). University of Texas Press. (page 5)
print('text_1: \n', remove_any_non_arabic_char(text_1))
print('\ntext_2: \n\n', remove_any_non_arabic_char(text_2))
Running the code on the two texts above in Jupyter, you get:
Notice that punctuation marks shared between Arabic and English (like periods and brackets) have also been removed. To keep those, you would need to introduce more complex conditionals.
I want to create a pdf in my Python application using a text that contains Turkish characters, but I get an error. My codes are below. How can I fix this?
# -*- coding: utf-8 -*-
from fpdf import FPDF
import os
def add_image(image_path):
pdf = FPDF()
pdf.add_page()
epw = pdf.w - 2 * pdf.l_margin
pdf.set_font('Arial', 'B', 14.0)
txt = u'ATATÜRK LİSESİ 2019 2020 EĞİTİM ÖĞRETİM YILI 11C SINIFI'
stxt = txt.encode('iso-8859-9')
pdf.cell(epw, 0.0, stxt, align='C')
I get an 'UnicodeEncodeError: 'latin-1' codec can't encode character '\u0130' in position 60: ordinal not in range(256)' error if I use the codes below
epw = pdf.w - 2 * pdf.l_margin
pdf.set_font('Arial', 'B', 14.0)
txt = 'ATATÜRK LİSESİ 2019 2020 EĞİTİM ÖĞRETİM YILI 11C SINIFI'
#stxt = txt.encode('iso-8859-9')
pdf.cell(epw, 0.0, txt, align='C')
I downloaded 'tr-arial.ttf' font to the application folder and i found this solution:
epw = pdf.w - 2 * pdf.l_margin
txt = u'ATATÜRK LİSESİ 2019 2020 EĞİTİM ÖĞRETİM YILI 11C SINIFI'
pdf.add_font('tr-arial', '', 'tr-arial.ttf', uni=True)
pdf.set_font('tr-arial', '', 11)
pdf.cell(epw, 0.0, txt, align='C')
I wat to print the array eponym but i get this error:
UnicodeEncodeError: 'latin-1' codec can't encode character '\u016b' in
position 1360: ordinal not in range(256)
This is my code
with codecs.open(epoFitx,encoding="utf-8") as fitx:
for line in fitx:
line = line.strip()
eponimo.insert(len(eponimo), line.lower())
print(" ", eponimo)
This is what I obtain:
['roux', 'tourette', 'sulzberger', 'mortimer', 'galeazzi', 'antley',
'henseleit', 'larsen', 'levret', 'bailey', 'schlemm', 'winegrad',
'hoover', 'klein', 'klatskin', 'uhl', 'codman', 'ober', 'b?hme',
'schonberg', 'anitschkow', 'c?stan', 'browne', 'albarran', 'ochsner',
'salzmann', 'cauchois', 'pette', 'michaelis', 'besredka', 'foix',
'bernhardt', 'cruchet', 'wilkins', 'benedict', 'pezzi', 'steinert',
'am?ndola', 'canga', 'oguchi', 'len?gre', 'arndt', 'bennett',
'bagolini', 'morrow', 'politzer', 'w?lfler', 'salomon', 'giaccai',
'dimitri', 'crowe', 'cockett', 'erdheim', 'steele']
I am trying to work with cyrillic alphabet and latin alphabet with central european characters, however I am not able to print cyrillic characters. Take a look at the sample code below.
# -*- coding: utf-8 -*-
print("ň")
print("ф")
I've been able to output "ň", once I have set "encoding": "cp1250" in Python.sublime-settings, but unfortunately I have not found any means of displaying cyrillic character.
Thanks for any help.
------------------------Edit--------------------------
meanwhile I've put together code, that works in Ubuntu 13.04 environment but throwing exception in Win 7.
Exception:
Traceback (most recent call last):
File "C:\Users\branislavs\docs\personal\scripts\playground.py", line 6, in <module>
for line in data:
File "C:\Python34\lib\encodings\cp1250.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 51: character maps to <undefined>
Environment:
Win 7
Python 3.4,
Sublime Text 2,
I am trying to output it on windows console.
What should in win to make it work?
I am pasting the code as well:
import sys
char_mapping = ("абвгґдезіийклмнопрстуфъыьцчжшАБВГҐДЕЗIИЙКЛМНОПРСТУФЪЫЬЦЧЖШ",
"abvhgdeziyjklmnoprstuf'ŷ'cčžšABVHGDEZIYJKLMNOPRSTUF'Ŷ'cČŽŠ")
syllable_mapping = {
"special": { #di ti ni li, da ta na la, cja, cji, sja, sji, rja, rji
"ďi": "дї",
"Ďi": "Дї",
"ťi": "тї",
"Ťi": "Тї",
"ňi": "нї",
"Ňi": "Нї",
"ľi": "лї",
"Ľi": "Лї",
"ďa": "дя",
"Ďa": "Дя",
"ťa": "тя",
"Ťa": "Тя",
"ňa": "ня",
"Ňa": "Hя",
"ľa": "ля",
"Ľa": "Ля",
"c'a": "ця",
"c'a": "Ця",
"c'i": "цї",
"C'i": "Цї",
"c'o": "цё",
"C'o": "Цё",
"s'a": "ся",
"S'a": "Ся",
"s'i": "сї",
"S'i": "Сї",
"s'o": "сё",
"S'o": "Сё",
"r'a": "ря",
"R'a": "Ря",
"r'i": "рї",
"R'i": "Рї",
"r'o": "рё",
"R'o": "Рё",
"z'a": "зя",
"Z'a": "Зя",
"z'i": "зї",
"Z'i": "Зї",
"z'o": "зё",
"Z'o": "Зё",
},
"carons": {
"ď": "дь",
"Ď": "Дь",
"ť": "ть",
"Ť": "Ть",
"ň": "нь",
"Ň": "Нь",
"ľ": "ль",
"Ľ": "Ль",
},
"basic" : {
"ja": "я",
"Ja": "Я",
"ju": "ю",
"Ju": "Ю",
"je": "є",
"Je": "Є",
"ch": "х",
"Ch": "X",
"'o": "ё",
"'O": "Ë",
"x": "кc",
"X": "Кc",
"šč": "щ",
"Šč": "Щ",
"ji": "ї",
"c'" : "ць",
"C'" : "Ць",
"s'" : "сь",
"S'" : "Сь",
"r'" : "рь",
"R'" : "Рь",
"z'" : "зь",
"Z'" : "Зь",
}
}
tr_azb_lat = {ord(a):ord(b) for a, b in zip(*char_mapping)}
tr_lat_azb = {ord(b):ord(a) for a, b in zip(*char_mapping)}
def map_syllables_azb_lat(string, mapping_option):
for rule in syllable_mapping[mapping_option]:
string = string.replace(syllable_mapping[mapping_option][rule], rule)
return string
def translit_azb_lat(string):
string = map_syllables_azb_lat(string, 'special')
string = map_syllables_azb_lat(string, 'carons')
string = map_syllables_azb_lat(string, 'basic')
return string.translate(tr_azb_lat).encode('utf-8').decode(sys.stdout.encoding)
def map_syllables_lat_azb(string, mapping_option):
for rule in syllable_mapping[mapping_option]:
string = string.replace(rule, syllable_mapping[mapping_option][rule])
return string
def translit_lat_azb(string):
string = map_syllables_lat_azb(string, 'special')
string = map_syllables_lat_azb(string, 'carons')
string = map_syllables_lat_azb(string, 'basic')
return string.translate(tr_lat_azb).encode('utf-8').decode(sys.stdout.encoding)