Remove Accent accents from characters using pyspark - python

I have accents in my data and want to remove from character. Example : Frédér8ic# --> frederic
using Pyspark code
I tried the below code:
def simplify(text):
import unicodedata
try:
text = unicode(text, 'utf-8')
except NameError:
pass
text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
return str(text)
But getting below error
text = unicode(text, 'utf-8')
TypeError: decoding str is not supported

def make_trans():
matching_string = ""
replace_string = ""
for i in range(ord(" "), sys.maxunicode):
name = unicodedata.name(chr(i), "")
if "WITH" in name:
try:
base = unicodedata.lookup(name.split(" WITH")[0])
matching_string += chr(i)
replace_string += base
except KeyError:
pass
return matching_string, replace_string
def clean_text(c):
matching_string, replace_string = make_trans()
return translate(
regexp_replace(c, "\p{M}", ""),
matching_string, replace_string
).alias(c)

Related

Python- encode base64 print new line

I want to try to encrypt my python code with base64.
But when I use \n I get an error:
print("hello
IndentationError: unexpected indent
This is my code I used:
import base64
def encode(data):
try:
# Standard Base64 Encoding
encodedBytes = base64.b64encode(data.encode("utf-8"))
return str(encodedBytes, "utf-8")
except:
return ""
def decode(data):
try:
message_bytes = base64.b64decode(data)
return message_bytes.decode('utf-8')
except:
return ""
your_code = encode("""
print("hello \n world")
""")
exec(decode(your_code))
I could use twice the print() function instead of \n but is there a way to use \n?
I hope you can help me out
First, you have to remove the indent in the your_code section. Second you have to replace \n with \\n
import base64
def encode(data):
try:
# Standard Base64 Encoding
encodedBytes = base64.b64encode(data.encode("utf-8"))
return str(encodedBytes, "utf-8")
except:
return ""
def decode(data):
try:
message_bytes = base64.b64decode(data)
return message_bytes.decode('utf-8')
except:
return ""
your_code = encode("""
print("hello \\n world")
""")
exec(decode(your_code))

How to create a list/dict and export the values to Excel from multiple PDF files

I'm doing a Pdfreader that will get the information from Energy Bills PDF in a directory. The program is running and printing exactly as I want to 'store' the information.
The next steps is:
Export to excel the values from each bills, exactly as print in console:
Console image
I've tried lists, dictionaries. Maybe in a wrong way, but none of trials was sucessful.
Any improvements and other ways to do the same, but smarter, is welcome.
Follow the code
import glob
from PyPDF2 import PdfFileReader
pdf_dir = "C:/Users/gabri/Desktop/py4e/Contas EDP AAP/Leitor PDF/Faturas"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
path = pdf_files
for file in pdf_files:
with open(file, 'rb') as f:
pdf = PdfFileReader(f)
page = pdf.getPage(1)
text = page.extractText()
text = text.split()
for word in text:
if word.endswith('Créditos'):
word = word.replace("Créditos", "")
word = word.replace("mês", "")
word = word.replace("kWh", "")
energia_injetada = word
print('Energia injetada: ', energia_injetada)
elif word.endswith('Saldo'):
word = word.replace("mês", "")
word = word.replace("Saldo", "")
word = word.replace("Participação", "")
word = word.replace("kWh", "")
energia_injetada = word
if 'Recebido' not in word:
print('Energia injetada: ', energia_injetada)
elif word.endswith('Saldo'):
word = word.replace("Recebido", "")
word = word.replace("kWhSaldo", "")
recebidos = word
print(recebidos)
elif word.endswith('Participação'):
word = word.replace("mês", "")
word = word.replace("Saldo", "")
word = word.replace("Participação", "")
word = word.replace("kWh", "")
energia_injetada = word
print('Saldo Atualizado: ', energia_injetada)
elif word.startswith('Verde'):
print('Bandeira verde')
elif word.startswith('Vermelha:'):
print('Bandeira vermelha')
elif word.startswith('Amarela'):
print('Bandeira Amarela: ')
elif word.startswith('('):
word = word.replace("(", "")
data_inicial = word
print('Data inicial: ', data_inicial)
elif word.endswith(')Nº'):
word = word.replace(")Nº", "")
data_final1 = word
print("Data Final: ", data_final1)
elif word.endswith(")Agradecemos"):
word = word.replace(")Agradecemos", "")
data_final = word
print('Data final: ', data_final)
elif word.startswith('Saldo'):
word = word.replace("Saldo", "")
participacao_saldo = word
print('Participação no Saldo: ', participacao_saldo)
print('\n\n')
else:
continue
if __name__ == '__main__' :
print('ok')
You can use print:
with open('out.csv', 'w') as fileout:
with open(file, 'rb') as f:
#some code
print("hi", "data", file=fileout,sep=";")

Is tokenize.detect_encoding(readline) only in python3?

In python2.7
AttributeError: 'module' object has no attribute 'detect_encoding'
for python2&3 compatibility,you can use:
from lib2to3.pgen2 import tokenize
tokenize.detect_encoding(f.readline)[0] # 'utf-8'
This function isn't available for python2.7, you can see it isn't listed on the https://docs.python.org/2.7/library/tokenize.html. That said, I don't see any reason why the python3.6 version wouldn't work on python2.7, ie:
import re
from codecs import lookup, BOM_UTF8
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
def _get_normal_name(orig_enc):
"""Imitates get_normal_name in tokenizer.c."""
# Only care about the first 12 characters.
enc = orig_enc[:12].lower().replace("_", "-")
if enc == "utf-8" or enc.startswith("utf-8-"):
return "utf-8"
if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
return "iso-8859-1"
return orig_enc
def detect_encoding(readline):
try:
filename = readline.__self__.name
except AttributeError:
filename = None
bom_found = False
encoding = None
default = 'utf-8'
def read_or_stop():
try:
return readline()
except StopIteration:
return b''
def find_cookie(line):
try:
# Decode as UTF-8. Either the line is an encoding declaration,
# in which case it should be pure ASCII, or it must be UTF-8
# per default encoding.
line_string = line.decode('utf-8')
except UnicodeDecodeError:
msg = "invalid or missing encoding declaration"
if filename is not None:
msg = '{} for {!r}'.format(msg, filename)
raise SyntaxError(msg)
match = cookie_re.match(line_string)
if not match:
return None
encoding = _get_normal_name(match.group(1))
try:
codec = lookup(encoding)
except LookupError:
# This behaviour mimics the Python interpreter
if filename is None:
msg = "unknown encoding: " + encoding
else:
msg = "unknown encoding for {!r}: {}".format(filename,
encoding)
raise SyntaxError(msg)
if bom_found:
if encoding != 'utf-8':
# This behaviour mimics the Python interpreter
if filename is None:
msg = 'encoding problem: utf-8'
else:
msg = 'encoding problem for {!r}: utf-8'.format(filename)
raise SyntaxError(msg)
encoding += '-sig'
return encoding
first = read_or_stop()
if first.startswith(BOM_UTF8):
bom_found = True
first = first[3:]
default = 'utf-8-sig'
if not first:
return default, []
encoding = find_cookie(first)
if encoding:
return encoding, [first]
if not blank_re.match(first):
return default, [first]
second = read_or_stop()
if not second:
return default, [first]
encoding = find_cookie(second)
if encoding:
return encoding, [first, second]
return default, [first, second]
coding, lines = detect_encoding(open("out.txt", 'rb').readline)
print(coding, lines)

Unicode Decode Error in Python with files

so I'm having this trouble with the decode. I found it in other threads how to do it for simple strings, with the u'string'.encode. But I can't find a way to make it work with files.
Any help would be appreciated!
Here's the code.
text = file.read()
text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))
file.seek(0) # rewind
file.write(text.encode('utf-8'))
and here's the whole code, should it help.
#!/usr/bin/env python
# coding: utf-8
"""
Script to helps on translate some code's methods from
portuguese to english.
"""
from multiprocessing import Pool
from mock import MagicMock
from goslate import Goslate
import fnmatch
import logging
import os
import re
import urllib2
_MAX_PEERS = 1
try:
os.remove('traducoes.log')
except OSError:
pass
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('traducoes.log')
logger.addHandler(handler)
def fileWalker(ext, dirname, names):
"""
Find the files with the correct extension
"""
pat = "*" + ext[0]
for f in names:
if fnmatch.fnmatch(f, pat):
ext[1].append(os.path.join(dirname, f))
def encontre_text(file):
"""
find on the string the works wich have '_' on it
"""
text = file.read().decode('utf-8')
return re.findall(r"\w+(?<=_)\w+", text)
#return re.findall(r"\"\w+\"", text)
def traduza_palavra(txt):
"""
Translate the word/phrase to english
"""
try:
# try connect with google
response = urllib2.urlopen('http://google.com', timeout=2)
pass
except urllib2.URLError as err:
print "No network connection "
exit(-1)
if txt[0] != '_':
txt = txt.replace('_', ' ')
txt = txt.replace('media'.decode('utf-8'), 'média'.decode('utf-8'))
gs = Goslate()
#txt = gs.translate(txt, 'en', gs.detect(txt))
txt = gs.translate(txt, 'en', 'pt-br') # garantindo idioma tupiniquim
txt = txt.replace(' en ', ' br ')
return txt.replace(' ', '_') # .lower()
def subistitua(file, txt, novo_txt):
"""
should rewrite the file with the new text in the future
"""
text = file.read()
text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))
file.seek(0) # rewind
file.write(text.encode('utf-8'))
def magica(File):
"""
Thread Pool. Every single thread should play around here with
one element from list os files
"""
global _DONE
if _MAX_PEERS == 1: # inviavel em multithread
logger.info('\n---- File %s' % File)
with open(File, "r+") as file:
list_txt = encontre_text(file)
for txt in list_txt:
novo_txt = traduza_palavra(txt)
if txt != novo_txt:
logger.info('%s -> %s [%s]' % (txt, novo_txt, File))
subistitua(file, txt, novo_txt)
file.close()
print File.ljust(70) + '[OK]'.rjust(5)
if __name__ == '__main__':
try:
response = urllib2.urlopen('http://www.google.com.br', timeout=1)
except urllib2.URLError as err:
print "No network connection "
exit(-1)
root = './app'
ex = ".py"
files = []
os.path.walk(root, fileWalker, [ex, files])
print '%d files found to be translated' % len(files)
try:
if _MAX_PEERS > 1:
_pool = Pool(processes=_MAX_PEERS)
result = _pool.map_async(magica, files)
result.wait()
else:
result = MagicMock()
result.successful.return_value = False
for f in files:
pass
magica(f)
result.successful.return_value = True
except AssertionError, e:
print e
else:
pass
finally:
if result.successful():
print 'Translated all files'
else:
print 'Some files were not translated'
Thank you all for the help!
In Python 2, reading from files produces regular (byte) string objects, not unicode objects. There is no need to call .encode() on these; in fact, that'll only trigger an automatic decode to Unicode first, which can fail.
Rule of thumb: use a unicode sandwich. Whenever you read data, you decode to unicode at that stage. Use unicode values throughout your code. Whenever you write data, encode at that point. You can use io.open() to open file objects that encode and decode automatically for you.
That also means you can use unicode literals everywhere; for your regular expressions, for your string literals. So use:
def encontre_text(file):
text = file.read() # assume `io.open()` was used
return re.findall(ur"\w+(?<=_)\w+", text) # use a unicode pattern
and
def subistitua(file, txt, novo_txt):
text = file.read() # assume `io.open()` was used
text = text.replace(txt, novo_txt)
file.seek(0) # rewind
file.write(text)
as all string values in the program are already unicode, and
txt = txt.replace(u'media', u'média')
as u'..' unicode string literals don't need decoding anymore.

Python: 'ascii' codec can't encode character u'\\u2026'

I am trying to use the Bing api in python with the following code:
#!/usr/bin/python
from bingapi import bingapi
import re
import json
import urllib
import cgi
import cgitb
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def get_data(self):
return ''.join(self.fed)
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
def strip_tags2(data):
p = re.compile(r'<[^<]*?>')
q = re.compile(r'[&;!##$%^*()]*')
data = p.sub('', data)
return q.sub('', data)
def getUrl(item):
return item['Url']
def getContent(item):
return item['Description']
def getTitle(item):
return item['Title']
def getInfo(qry, siteStr):
qryStr = qry + "+" + siteStr
#qryStr = u"%s" % qryStr.encode('UTF-8')
query = urllib.urlencode({'q' : qryStr})
url = 'http://api.bing.net/json.aspx?Appid=<myappid>&Version=2.2&Market=en-US&Query=%s&Sources=web&Web.Count=10&JsonType=raw' % (query)
search_results = urllib.urlopen(url)
j = json.loads(search_results.read())
results = j['SearchResponse']['Web']['Results']
return results
def updateRecent(qry):
f = open("recent.txt", "r")
lines = f.readlines()
f.close()
lines = lines[1:]
if len(qry) > 50: #truncate if string too long
qry = (qry[:50] + '...')
qry = strip_tags2(qry) #strip out the html if injection try
lines.append("\n%s" % qry)
f = open("recent.txt", "w")
f.writelines(lines)
f.close()
if __name__ == '__main__':
form = cgi.FieldStorage()
qry = form["qry"].value
qry = r'%s' % qry
updateRecent(qry)
siteStr = "(site:answers.yahoo.com OR site:chacha.com OR site:blurtit.com OR site:answers.com OR site:question.com OR site:answerbag.com OR site:stackexchange.com)"
print "Content-type: text/html"
print
header = open("header.html", "r")
contents = header.readlines()
header.close()
for item in contents:
print item
print """
<div id="results">
<center><h1>Results:</h1></center>
"""
for item in getInfo(siteStr, qry):
print "<h3>%s</h3>" % getTitle(item)
print "<br />"
print "%s" % getUrl(item)
print "<br />"
print "<p style=\"color:gray\">%s</p>" % getContent(item)
print "<br />"
print "</div>"
footer = open("footer.html", "r")
contents = footer.readlines()
footer.close()
for thing in contents:
print thing
I prints a few results, and then gives me the following error:
UnicodeEncodeError: 'ascii' codec can't encode character u'\\u2026' in position 72: ordinal not in range(128)
Can someone explain why this is happening? It clearly has something to do with how the url is getting encoded, but what is exactly is wrong? Thanks in advance!
That particular Unicode character is "HORIZONTAL ELLIPSIS". One or more of your getXXXXX() functions are returning Unicode strings, one of which contains a non-ASCII character. I suggest declaring the encoding of your output, for example:
Content-Type: text/html; charset=utf-8
and explicitly encoding your output in that encoding.
We need to know the line number where the exception was thrown, it will be in the backtrace. Anyway, the problem is that you are reading unicode from the files/URLs and then implicitly converting them to US-ASCII, probably in one of the concatenation operations. You should prefix all constant strings with u to indicate that they are unicode strings, like in
u"\n%s" % qry

Categories

Resources