lxml encoding changes while appending string to list - python

I'm using lxml's etree to parse an HTML file. The code looks like this:
import urllib
from lxml import etree

def get_all_languages():
    allLanguages = "http://wold.livingsources.org/vocabulary"
    f = urllib.urlopen(allLanguages).read()
    #encoding = chardet.detect(f)['encoding']
    #f.decode(encoding, 'replace').encode('utf-8')
    html = etree.HTML(f)
    #result = etree.tostring(html, pretty_print=True, method="html")
    #print result  # is of type str
    return html
Then I extract some information from the website and save it in a list.
While appending the strings to the list, the encoding or format of the strings changes; I think they become some kind of unicode objects?
I thought this wouldn't be a problem, since I take them back out of the list and print them to an output file.
def print_file():
    output = 'WOLDDictionaries.txt'
    dataLanguage = get_data_all_languages()
    dataDictionaries = get_site_single_lang()
    outputFile = open(output, "w")
    for index, array in enumerate(dataLanguage):
        indexLang = index
        for item in array:
            outputFile.write(item + "\t")
        outputFile.write("\n")
        for index, array in enumerate(dataDictionaries):
            indexDic = index
            if indexLang == indexDic:
                for data in array:
                    for word in data:
                        outputFile.write(word + "\t")
                    outputFile.write("\n")
    outputFile.close()
Well, that thought was wrong: when printed to the file, the encoding is still wrong.
What can I do to get the right characters?

OK, I found the problem. It was in the parser.
I changed the encoding in the parser and got the right encoding in the end.
The method now looks like this:
myparser = etree.HTMLParser(encoding='utf-8')
html = etree.HTML(f, parser=myparser)
return html
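For completeness, a minimal Python 2 sketch of the whole flow with that fix; codecs.open is one possible way (an assumption, not part of the original code) to make the output file accept unicode strings directly:

import urllib
import codecs
from lxml import etree

def get_all_languages():
    f = urllib.urlopen("http://wold.livingsources.org/vocabulary").read()
    # tell the parser the page's encoding instead of letting it guess
    myparser = etree.HTMLParser(encoding='utf-8')
    return etree.HTML(f, parser=myparser)

# codecs.open encodes unicode strings on write, so the extracted text
# keeps its characters intact in WOLDDictionaries.txt
outputFile = codecs.open('WOLDDictionaries.txt', 'w', encoding='utf-8')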


extract specific words from text

I have the text below; it is of type <class 'str'>:
"pois":[{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"附近","distance":"42","name":"金融街新融御","poiType":"房地产","point":{"x":106.56036820713899,"y":29.577630842401097},"tag":"房地产;住宅区","tel":"","uid":"71785bc6fdf697313fe2315e","zip":"","parent_poi":{"name":"","tag":"","addr":"","point":{"x":0.0,"y":0.0},"direction":"","distance":"","uid":""}},{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"东北","distance":"123","name":"乐融幼儿园","poiType":"教育培训","point":{"x":106.55956871523539,"y":29.57748164058615},"tag":"教育培训;幼儿园","tel":"","uid":"ae5c28352429035ed3ff2381","zip":"","parent_poi":{"name":"","tag":"","addr":"","point":{"x":0.0,"y":0.0},"direction":"","distance":"","uid":""}},{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"东","distance":"58","name":"龙湖春森彼岸2-5栋","poiType":"房地产","point":{"x":106.56003583410042,"y":29.577929245362655},"tag":"房地产;内部楼栋","tel":"","uid":"2982a6e5b02290c58f1a89cb","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"重庆市江北区长新路与刘家台正街交叉路口往西北约160米","cp":" ","direction":"东","distance":"108","name":"龙湖·春森彼岸-8号车库","poiType":"交通设施","point":{"x":106.55962261356597,"y":29.578141267978038},"tag":"交通设施;停车场","tel":"","uid":"63b440710aa3128fadee7a4a","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"东","distance":"113","name":"龙湖·春森彼岸-北门","poiType":"出入口","point":{"x":106.55957769829048,"y":29.578149120658855},"tag":"出入口;门","tel":"","uid":"4eabe73ddd66fc9a3a13fd89","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"北滨路陈家馆A宗E宗","cp":" ","direction":"东","distance":"154","name":"龙湖春森彼岸2-8栋","poiType":"房地产","point":{"x":106.55917346081113,"y":29.577944950757006},"tag":"房地产;内部楼栋","tel":"","uid":"1e3f326e817ccb9fc909f036","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"重庆市江北区长新路与刘家台正街交叉路口往西约120米","cp":" ","direction":"东北","distance":"197","name":"龙湖星悦荟8区(龙湖星悦荟店)-停车库","poiType":"交通设施","point":{"x":106.55942498635383,"y":29.57672777537788},"tag":"交通设施;停车场","tel":"","uid":"868395aeca61ca754251c078","zip":"","parent_poi":{"name":"龙湖星悦荟8区(龙湖星悦荟店)","tag":"购物;购物中心".....
I want to extract:
"tag":"房地产;住宅"
"tag":"教育培训;幼儿园"
"tag":"房地产;内部楼栋"
"tag":"交通设施;停车场"
"tag":"购物;购物中心"
I have tried:
startString = '"tag":'
endString = ',"tel"'
mySubString = []
mySubString.append(result[result.find(startString) + len(startString):result.find(endString)])
but this only captures the first occurrence; I couldn't iterate over the whole text.
I hope this works for you. I used a regex to extract these tags:
import re

extract_str = [
    '"tag":"房地产;住宅"', '"tag":"教育培训;幼儿园"', '"tag":"房地产;内部楼栋"',
    '"tag":"交通设施;停车场"', '"tag":"购物;购物中心"'
]

with open('./extract_specific_from_text.txt', 'r', encoding="utf8") as f:
    file_txt = f.read()

for i in extract_str:
    tags = re.findall(i, file_txt)
    if len(tags) > 0:
        print(f'{list(set(tags))[0]} found in the file')
    else:
        print(f'{i} not found in the file')
You can try something like:
import json

data = json.loads(data)
print([el['tag'] for el in data["pois"]])
which will print all the tags in the JSON.
# Output
['房地产;住宅区', '教育培训;幼儿园', '房地产;内部楼栋', '交通设施;停车场', '出入口;门', '房地产;内部楼栋', '交通设施;停车场']
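If the snippet is not valid JSON on its own (the sample starts at "pois": without enclosing braces, and is truncated), a non-greedy regex over the raw string is a possible fallback. A sketch, assuming the raw text is already in a variable named text:

import re

# capture the value of every "tag":"..." pair in one pass
tags = re.findall(r'"tag":"(.*?)"', text)
print(tags)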

How to fix errors IndexError: list index out of range

I would like to load data consisting of 10 categories of documents; each category contains text files. But I keep getting the following error:
IndexError: list index out of range
This is the code:
import os
from os.path import join

def load_data(folder):
    data = []
    files = [join(folder, x) for x in os.listdir(folder)]
    for file in files:
        topic = file.split("/")[9]  # this is where the error occurs
        label = topic.replace(" ", "_")
        name = "__label__" + label
        with open(file, "rb") as f:
            content = f.read()
            content = content.decode('utf-16')
            content = " ".join(i for i in content.split())
            data.append(name + " " + content)
    return data
An easy way to debug this would be to add print statements and check what the objects hold. For example, in this case you can add two print statements at the beginning of the for loop. This will help you figure out why you are getting the IndexError:
def load_data(folder):
    data = []
    files = [join(folder, x) for x in os.listdir(folder)]
    for file in files:
        print(file)
        print(file.split("/"))
        topic = file.split("/")[9]  # this is where the error occurs
        label = topic.replace(" ", "_")
        name = "__label__" + label
        with open(file, "rb") as f:
            content = f.read()
            content = content.decode('utf-16')
            content = " ".join(i for i in content.split())
            data.append(name + " " + content)
    return data
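The usual culprit is that the hard-coded index 9 does not match the actual path depth, or the separator is not "/" on Windows. A sketch of a less brittle variant, assuming (as the question's layout suggests) that the category name is the folder directly containing each file:

import os

def topic_of(path):
    # take the name of the file's parent directory instead of a fixed index;
    # os.path handles the platform's separator automatically
    return os.path.basename(os.path.dirname(path))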

Looping over a csv, trying to write to another csv

I'm looping over a CSV of links, visiting those links, and then trying to write information from those links to a new file:
with open("hrefs.csv", "rb") as f:
reader = csv.reader(f)
for row in reader:
newUrl = row[0]
response = requests.get(newUrl)
newData = response.text
newSoup = BeautifulSoup(newData, 'lxml')
newstring = ''
titles = newSoup.findAll('span', {'id': 'titletextonly'})
prices = newSoup.findAll('span', {'class': 'price'})
newstring += titles[0].text + ',' + prices[0].text + ','
for ana in newSoup.findAll('p',{'class':'attrgroup'}):
for myb in ana.findAll('b'):
newstring += myb.text + ','
print newstring
listFile = open("output.csv", 'wb')
writer = csv.writer(listFile)
writer.writerow(newstring.encode('ascii', 'ignore').decode('ascii'))
There are a couple of problems I'm running into. First, I thought the csv writer would recognize the comma-separated values and put each attribute in a new column. Second, it seems that one letter ends up in each column. When I simply print each newstring it gives me a coherent string.
You need to give writer.writerow a sequence of strings. A plain string is itself a sequence of characters, so csv.writer treats each character as a separate field, which is why one letter ends up in each column:
writer.writerow(newstring.split(","))
would be the easiest change from what you currently have.
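A related sketch, assuming the same page structure as the question: build the row as a list instead of a comma-joined string, and open the output file once before the loop so earlier rows are not overwritten.

import csv
import requests
from bs4 import BeautifulSoup

with open("hrefs.csv", "rb") as f, open("output.csv", "wb") as listFile:
    writer = csv.writer(listFile)
    for row in csv.reader(f):
        soup = BeautifulSoup(requests.get(row[0]).text, 'lxml')
        # collect the fields as list items rather than concatenating with commas
        fields = [soup.findAll('span', {'id': 'titletextonly'})[0].text,
                  soup.findAll('span', {'class': 'price'})[0].text]
        for ana in soup.findAll('p', {'class': 'attrgroup'}):
            for myb in ana.findAll('b'):
                fields.append(myb.text)
        writer.writerow([field.encode('ascii', 'ignore') for field in fields])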

Read CSV file and filter results

I'm writing a script, and one of its functions reads a CSV file that contains URLs in one of its columns. Unfortunately the system that creates those CSVs doesn't put double quotes around values in the URL column, so when a URL contains commas it breaks all my CSV parsing.
This is the code I'm using:
import csv

with open(accesslog, 'r') as csvfile, open('results.csv', 'w') as enhancedcsv:
    reader = csv.DictReader(csvfile)
    for row in reader:
        self.uri = row['URL']
        self.OriCat = row['Category']
        self.query(self.uri)
        print self.URL + "," + self.ServerIP + "," + self.OriCat + "," + self.NewCat
This is a sample URL that breaks the parsing; it comes in the column named "URL" (note the commas at the end):
ams1-ib.adnxs.com/ww=1238&wh=705&ft=2&sv=43&tv=view5-1&ua=chrome&pl=mac&x=1468251839064740641,439999,v,mac,webkit_chrome,view5-1,0,,2,
The column after the URL always contains a numeric value in parentheses, e.g. (9999), so this could be used to determine where a URL with commas ends.
How can I deal with a situation like this using the csv module?
You will have to do it a little more manually. Try this:
def process(lines, delimiter=','):
    header = None
    url_index_from_start = None
    url_index_from_end = None
    for line in lines:
        if not header:
            header = [l.strip() for l in line.split(delimiter)]
            url_index_from_start = header.index('URL')
            url_index_from_end = len(header) - url_index_from_start
        else:
            data = [l.strip() for l in line.split(delimiter)]
            url_from_start = url_index_from_start
            # however many extra commas the URL contains, the fields after it
            # keep their distance from the end of the row
            url_from_end = len(data) - url_index_from_end
            values = data[:url_from_start] + data[url_from_end + 1:] + [delimiter.join(data[url_from_start:url_from_end + 1])]
            keys = header[:url_index_from_start] + header[url_index_from_start + 1:] + [header[url_index_from_start]]
            yield dict(zip(keys, values))
Usage:
lines = ['Header1, Header2, URL, Header3',
         'Content1, "Content2", abc,abc,,abc, Content3']
result = list(process(lines))
assert result[0]['Header1'] == 'Content1'
assert result[0]['Header2'] == '"Content2"'
assert result[0]['Header3'] == 'Content3'
assert result[0]['URL'] == 'abc,abc,,abc'
print(result)
Result:
>>> [{'URL': 'abc,abc,,abc', 'Header2': '"Content2"', 'Header3': 'Content3', 'Header1': 'Content1'}]
Have you considered using Pandas to read your data in?
Another possible solution would be to use regular expressions to pre-process the data...
import re

# read the file once
with open(filein, 'r') as f:
    filedata = f.read()

# make a list of everything you want to change
# ('regex' is assumed to be a pattern matching the unquoted URLs)
old = re.findall(regex, filedata)

# append quotes and create a new list
new = []
for url in old:
    new.append('"' + url + '"')

# combine the lists
old_new = list(zip(old, new))

# then use the list to update the file; replacing on the accumulated
# filedata keeps every substitution instead of only the last one
for o, n in old_new:
    filedata = filedata.replace(o, n)

with open(filein, 'w') as f:
    f.write(filedata)
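Since the question notes that the field right after the URL is always a numeric value in parentheses, e.g. (9999), that marker can also be used to re-join the URL's split fields. A sketch under that assumption, reusing accesslog and the URL column name from the question:

import csv
import re

def rejoin_url(fields, url_index):
    # everything between the URL column and the "(9999)"-style marker
    # belongs to the URL, so glue those fields back together
    for i in range(url_index + 1, len(fields)):
        if re.match(r'\(\d+\)$', fields[i].strip()):
            return fields[:url_index] + [','.join(fields[url_index:i])] + fields[i:]
    return fields  # no marker found; leave the row unchanged

with open(accesslog, 'r') as csvfile:
    reader = csv.reader(csvfile)
    header = next(reader)
    url_index = header.index('URL')
    for fields in reader:
        row = dict(zip(header, rejoin_url(fields, url_index)))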

Modifying file content using python

I have a file that consists of a single line:
a,x,b,c,d,e
I want to convert this into
a,x,b,x,c,x,d,x,e,x
Is there an easy way to achieve this with Python?
my_file = open(filename)
data = my_file.read()
data = data.split(',')
result = ''
for each in data:
    if each != 'x':
        # append ',x' after every field that is not already an 'x'
        result += each + ',' + 'x' + ','
result = result.strip(',')
print result
import re

s = open(filename).read()
# match every letter except 'x', then join them with ',x,' in between
open(filename, 'w').write(',x,'.join(re.findall(r'[a-wyz]', s)) + ',x\n')
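A sketch that does not assume single-letter fields (the regex above only matches individual letters): split on commas, drop the existing 'x' fields, and interleave a fresh 'x' after each remaining field.

with open(filename) as f:
    fields = [p for p in f.read().strip().split(',') if p != 'x']

# 'a,x,b,c,d,e' -> 'a,x,b,x,c,x,d,x,e,x'
with open(filename, 'w') as f:
    f.write(','.join(field + ',x' for field in fields) + '\n')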
