extract specific words from text - python

I have the text below (it is of type <class 'str'>):
"pois":[{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"附近","distance":"42","name":"金融街新融御","poiType":"房地产","point":{"x":106.56036820713899,"y":29.577630842401097},"tag":"房地产;住宅区","tel":"","uid":"71785bc6fdf697313fe2315e","zip":"","parent_poi":{"name":"","tag":"","addr":"","point":{"x":0.0,"y":0.0},"direction":"","distance":"","uid":""}},{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"东北","distance":"123","name":"乐融幼儿园","poiType":"教育培训","point":{"x":106.55956871523539,"y":29.57748164058615},"tag":"教育培训;幼儿园","tel":"","uid":"ae5c28352429035ed3ff2381","zip":"","parent_poi":{"name":"","tag":"","addr":"","point":{"x":0.0,"y":0.0},"direction":"","distance":"","uid":""}},{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"东","distance":"58","name":"龙湖春森彼岸2-5栋","poiType":"房地产","point":{"x":106.56003583410042,"y":29.577929245362655},"tag":"房地产;内部楼栋","tel":"","uid":"2982a6e5b02290c58f1a89cb","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"重庆市江北区长新路与刘家台正街交叉路口往西北约160米","cp":" ","direction":"东","distance":"108","name":"龙湖·春森彼岸-8号车库","poiType":"交通设施","point":{"x":106.55962261356597,"y":29.578141267978038},"tag":"交通设施;停车场","tel":"","uid":"63b440710aa3128fadee7a4a","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"东","distance":"113","name":"龙湖·春森彼岸-北门","poiType":"出入口","point":{"x":106.55957769829048,"y":29.578149120658855},"tag":"出入口;门","tel":"","uid":"4eabe73ddd66fc9a3a13fd89","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"北滨路陈家馆A宗E宗","cp":" ","direction":"东","distance":"154","name":"龙湖春森彼岸2-8栋","poiType":"房地产","point":{"x":106.55917346081113,"y":29.577944950757006},"tag":"房地产;内部楼栋","tel":"","uid":"1e3f326e817ccb9fc909f036","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"重庆市江北区长新路与刘家台正街交叉路口往西约120米","cp":" ","direction":"东北","distance":"197","name":"龙湖星悦荟8区(龙湖星悦荟店)-停车库","poiType":"交通设施","point":{"x":106.55942498635383,"y":29.57672777537788},"tag":"交通设施;停车场","tel":"","uid":"868395aeca61ca754251c078","zip":"","parent_poi":{"name":"龙湖星悦荟8区(龙湖星悦荟店)","tag":"购物;购物中心".....
I want to extract:
"tag":"房地产;住宅"
"tag":"教育培训;幼儿园"
"tag":"房地产;内部楼栋"
"tag":"交通设施;停车场"
"tag":"购物;购物中心"
I have tried:
startString = '"tag":'
endString = ',"tel"'
mySubString.append(result[result.find(startString) + len(startString):result.find(endString)])
but find() only locates the first occurrence, so I couldn't iterate over the whole text.

I hope this works for you. I used regex to extract these tags:
import re

extract_str = [
    '"tag":"房地产;住宅"', '"tag":"教育培训;幼儿园"', '"tag":"房地产;内部楼栋"',
    '"tag":"交通设施;停车场"', '"tag":"购物;购物中心"'
]

with open('./extract_specific_from_text.txt', 'r', encoding="utf8") as f:
    file_txt = f.read()

for i in extract_str:
    tags = re.findall(i, file_txt)
    if len(tags) > 0:
        print(f'{list(set(tags))[0]} found in the file')
    else:
        print(f'{i} not found in the file')

You can try something like:
import json
data = json.loads(data)
print([el['tag'] for el in data["pois"]])
This will print all the tags in the JSON.
# Output
['房地产;住宅区', '教育培训;幼儿园', '房地产;内部楼栋', '交通设施;停车场', '出入口;门', '房地产;内部楼栋', '交通设施;停车场']
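If the string is not valid JSON (for example because it is truncated, as above), a regex version of the same idea also works. A minimal sketch, assuming the raw text lives in a string named text (a name introduced here for illustration):
import re

# capture the value of every "tag" field in one pass;
# the empty tags from the parent_poi entries are filtered out
tags = [t for t in re.findall(r'"tag":"(.*?)"', text) if t]
print(set(tags))  # the unique tags found in the text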

Related

how to get the contents in brackets in a file Python

I am working with files right now and I want to get the text inside brackets. This is what I mean by getting text from brackets:
{
this is text for a
this is text for a
this is text for a
this is text for a
}
[
this is text for b
this is text for b
this is text for b
this is text for b
]
The content in a is "this is text for a" and the content in b is "this is text for b".
My code does not seem to print the contents of a properly; it shows both a and b from my file.
My code:
with open('file.txt', 'r') as read_obj:
    for line in read_obj.readlines():
        var = line[line.find("{")+1:line.rfind("}")]
        print(var)
Iterate over the file and, for each line, check the first character:
- if the first character is either '[' or '{', start accumulating lines
- if the first character is either ']' or '}', stop accumulating lines
a_s = []
b_s = []
capture = False
group = None

with open(path) as f:
    for line in f:
        # check the delimiters first so the bracket lines
        # themselves are not captured into the groups
        if line[0] in '{[':
            capture = True
            group = a_s if line[0] == '{' else b_s
        elif line[0] in '}]':
            capture = False
            group = None
        elif capture:
            group.append(line)

print(a_s)
print(b_s)
Relies on the file to be structured exactly as shown in the example.
This is what regular expressions are made for. Python has a built-in module named re to perform regular expression queries.
In your case, simply:
import re

fname = "foo.txt"

# Read data into a single string
with open(fname, "r") as f:
    data = f.read()

# Remove newline characters from the string
data = re.sub(r"\n", "", data)

# Define pattern for type A
pattern_a = r"\{(.*?)\}"
# Print text of type A
print(re.findall(pattern_a, data))

# Define pattern for type B
pattern_b = r"\[(.*?)\]"
# Print text of type B
print(re.findall(pattern_b, data))
Output:
['this is text for athis is text for athis is text for athis is text for a']
['this is text for bthis is text for bthis is text for bthis is text for b']
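As a side note, the newlines are removed first because '.' does not match newline characters by default. An alternative sketch (my addition, not part of the original answer) keeps the file contents intact and passes re.DOTALL instead:
import re

with open("foo.txt", "r") as f:
    data = f.read()

# re.DOTALL lets '.' match newlines, so the captured groups
# keep their internal line breaks instead of being flattened
print(re.findall(r"\{(.*?)\}", data, re.DOTALL))
print(re.findall(r"\[(.*?)\]", data, re.DOTALL))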
Read the file and split the content into a list. Define a list of brackets, exclude them in a loop, and write the rest to a file.
file_obj = open("content.txt", 'r')
content_list = file_obj.read().splitlines()
brackets = ['[', ']', '{', '}']
for i in content_list:
    if i not in brackets:
        writer = open("new_content.txt", 'a')
        writer.write(i + '\n')
        writer.close()
f1 = open('D:\\Tests 1\\t1.txt', 'r')
flag = 0
for line in f1.readlines():
    # find() returns -1 when the substring is absent, so compare explicitly
    if line.find('{\n') != -1 or line.find('[\n') != -1:
        flag = 1
        continue
    if line.find('}\n') != -1 or line.find(']\n') != -1:
        flag = 0
    if flag == 1:
        print(line.split('\n')[0])

missing some text when parsing article content with lxml, what is the issue?

I am using the following code to parse an article from a French news site. When getting all the paragraphs, I keep missing some text. Why is that?
Here is my code; the lines marked with XX are the most relevant, the other parts are just me putting the data into my own structure:
def getWordList(sent, wordList):
    listOfWords = list((sent).split())
    for i in listOfWords:
        i = i.replace(".", " ")
        i = i.replace(",", " ")
        i = i.replace('\"', " ")
        valids = re.sub(r"[^A-Za-z]+", '', i)
        if(len(i) > 3 and (i.lower() not in stopWords) and i.isnumeric() != True and valids):
            wordList[valids] = {}
            wordList[valids]["definition"] = ""
            wordList[valids]["status"] = ""

def parse(link):
    page = requests.get(link)
    tree = html.fromstring(page.content)
    XXword = tree.xpath('//*[@class="article__content old__article-content-single"]')
    articleContent = {}
    articleContent["words"] = {}
    articleContent["language"] = "French"
    wordList = articleContent["words"]
    contentList = []
    XXpTag = word[0].xpath('//*')
    pText = {}
    for x in range(len(pTag)):
        #print(pTag[x].get("class"))
        if(pTag[x].text != None):
            if(pTag[x].tail != None):
                print("tail")
                XXtext = pTag[x].text + pTag[x].tail
            else:
                print("no tail")
                XXtext = pTag[x].text
            XXif(pTag[x].get("class") == "article__paragraph "):
                print(pTag[x].get("class"))
                print(text)
                getWordList(text, wordList)
                pText[text] = {}
                pText[text]["status"] = ""
                pText[text]["type"] = "p"
            XXelif(pTag[x].get("class") == "article__sub-title"):
                print(pTag[x].get("class"))
                getWordList(text, wordList)
                pText[text] = {}
                pText[text]["status"] = ""
                pText[text]["type"] = "h2"
Here is an example article link: https://www.lemonde.fr/economie/article/2019/05/23/vivendi-chercherait-a-ceder-universal-music-group-au-chinois-tencent_5466130_3234.html
I am successfully getting all the highlighted text, but the rest is missing. I don't mean the text in the middle, which I am deliberately avoiding; I just want the text in between, which is not being included.
Thank you for your help!!
You're trying to get the content of tags containing other tags. For example, there are <em> emphasized text tags in the <p> paragraph tags.
Use the text_content() method instead of text to get the full content of your paragraphs:
text = pTag[x].text_content() + pTag[x].tail
and
text = pTag[x].text_content()
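A minimal standalone demonstration of the difference (constructed for illustration, not taken from the question's article):
from lxml import html

# a paragraph with an <em> child: .text stops at the first child tag
p = html.fromstring("<p>Start <em>middle</em> end</p>")
print(p.text)            # 'Start ' - the emphasized text and its tail are lost
print(p.text_content())  # 'Start middle end' - the full recursive text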

Read CSV file and filter results

I'm writing a script where one of its functions is to read a CSV file that contains URLs in one of its columns. Unfortunately the system that creates those CSVs doesn't put double quotes around values in the URL column, so when the URL contains commas it breaks all my CSV parsing.
This is the code I'm using:
with open(accesslog, 'r') as csvfile, open('results.csv', 'w') as enhancedcsv:
    reader = csv.DictReader(csvfile)
    for row in reader:
        self.uri = (row['URL'])
        self.OriCat = (row['Category'])
        self.query(self.uri)
        print self.URL + "," + self.ServerIP + "," + self.OriCat + "," + self.NewCat
This is a sample URL that is breaking the parsing; it comes in the column named "URL" (note the commas at the end):
ams1-ib.adnxs.com/ww=1238&wh=705&ft=2&sv=43&tv=view5-1&ua=chrome&pl=mac&x=1468251839064740641,439999,v,mac,webkit_chrome,view5-1,0,,2,
The field following the URL always comes with a numeric value between parentheses, e.g. (9999), so this could be used to determine where the URL with commas ends.
How can I deal with a situation like this using the csv module?
You will have to do it a little more manually. Try this
def process(lines, delimiter=','):
    header = None
    url_index_from_start = None
    url_index_from_end = None
    for line in lines:
        if not header:
            header = [l.strip() for l in line.split(delimiter)]
            url_index_from_start = header.index('URL')
            url_index_from_end = len(header) - url_index_from_start
        else:
            data = [l.strip() for l in line.split(delimiter)]
            url_from_start = url_index_from_start
            url_from_end = len(data) - url_index_from_end
            values = data[:url_from_start] + data[url_from_end+1:] + [delimiter.join(data[url_from_start:url_from_end+1])]
            keys = header[:url_index_from_start] + header[url_index_from_end+1:] + [header[url_index_from_start]]
            yield dict(zip(keys, values))
Usage:
lines = ['Header1, Header2, URL, Header3',
'Content1, "Content2", abc,abc,,abc, Content3']
result = list(process(lines))
assert result[0]['Header1'] == 'Content1'
assert result[0]['Header2'] == '"Content2"'
assert result[0]['Header3'] == 'Content3'
assert result[0]['URL'] == 'abc,abc,,abc'
print(result)
Result:
>>> [{'URL': 'abc,abc,,abc', 'Header2': '"Content2"', 'Header3': 'Content3', 'Header1': 'Content1'}]
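A variant that leans on the csv module itself, as the question asks, is sketched below; it assumes exactly one column (URL) can contain unquoted commas and that the expected column count is known:
import csv

def read_with_unquoted_url(path, url_index, n_cols):
    # any fields beyond the expected column count were created by
    # unquoted commas inside the URL, so re-join them into one field
    with open(path, 'r') as f:
        for fields in csv.reader(f):
            extra = len(fields) - n_cols
            url = ','.join(fields[url_index:url_index + extra + 1])
            yield fields[:url_index] + [url] + fields[url_index + extra + 1:]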
Have you considered using Pandas to read your data in?
Another possible solution would be to use regular expressions to pre-process the data...
import re

#make a list of everything you want to change
#(`regex` below is a placeholder for your URL-matching pattern)
f = open(filein, 'r')
filedata = f.read()
f.close()
old = re.findall(regex, filedata)

#append quotes and create a new list
new = []
for url in old:
    url2 = "\"" + url + "\""
    new.append(url2)

#combine the lists
old_new = list(zip(old, new))

#Then use the list to update the file, accumulating each
#replacement instead of overwriting it on the next pass:
for o, n in old_new:
    filedata = filedata.replace(o, n)

f = open(filein, 'w')
f.write(filedata)
f.close()
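The regex placeholder above is left open; one possible concrete pattern (an assumption built from the sample row, where a "(9999)"-style field always follows the URL) is:
import re

# made-up sample line in the shape the question describes
line = 'x.example.com/a=1,b,,c,(9999),Category'

# match a slash-containing field plus everything up to the
# ",(digits)" field that always follows the URL
m = re.search(r'([^,]+/[^(]*?),\(\d+\)', line)
if m:
    url = m.group(1)
    line = line.replace(url, '"' + url + '"')
print(line)  # "x.example.com/a=1,b,,c",(9999),Category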

Parse HTML table data to JSON and save to text file in Python 2.7

I'm trying to extract the data on the crime rate across states from this webpage:
http://www.disastercenter.com/crime/uscrime.htm
I am able to get this into a text file, but I would like to get the response in JSON format. How can I do this in Python?
Here is my code:
import urllib
import re
from bs4 import BeautifulSoup
link = "http://www.disastercenter.com/crime/uscrime.htm"
f = urllib.urlopen(link)
myfile = f.read()
soup = BeautifulSoup(myfile)
soup1=soup.find('table', width="100%")
soup3=str(soup1)
result = re.sub("<.*?>", "", soup3)
print(result)
output=open("output.txt","w")
output.write(result)
output.close()
The following code will get the data from the two tables and output all of it as a json formatted string.
Working Example (Python 2.7.9):
from lxml import html
import requests
import re as regular_expression
import json

page = requests.get("http://www.disastercenter.com/crime/uscrime.htm")
tree = html.fromstring(page.text)

tables = [tree.xpath('//table/tbody/tr[2]/td/center/center/font/table/tbody'),
          tree.xpath('//table/tbody/tr[5]/td/center/center/font/table/tbody')]

tabs = []
for table in tables:
    tab = []
    for row in table:
        for col in row:
            var = col.text_content()
            var = var.strip().replace(" ", "")
            var = var.split('\n')
            if regular_expression.match('^\d{4}$', var[0].strip()):
                tab_row = {}
                tab_row["Year"] = var[0].strip()
                tab_row["Population"] = var[1].strip()
                tab_row["Total"] = var[2].strip()
                tab_row["Violent"] = var[3].strip()
                tab_row["Property"] = var[4].strip()
                tab_row["Murder"] = var[5].strip()
                tab_row["Forcible_Rape"] = var[6].strip()
                tab_row["Robbery"] = var[7].strip()
                tab_row["Aggravated_Assault"] = var[8].strip()
                tab_row["Burglary"] = var[9].strip()
                tab_row["Larceny_Theft"] = var[10].strip()
                tab_row["Vehicle_Theft"] = var[11].strip()
                tab.append(tab_row)
    tabs.append(tab)

json_data = json.dumps(tabs)

output = open("output.txt", "w")
output.write(json_data)
output.close()
This might be what you want, if you can use the requests and lxml modules. The data structure presented here is very simple; adjust it to your needs.
First, get a response from your requested URL and parse the result into an HTML tree:
import requests
from lxml import etree
import json
response = requests.get("http://www.disastercenter.com/crime/uscrime.htm")
tree = etree.HTML(response.text)
Assuming you want to extract both tables, create this XPath and unpack the results. totals is "Number of Crimes" and rates is "Rate of Crime per 100,000 People":
xpath = './/table[@width="100%"][@style="background-color: rgb(255, 255, 255);"]//tbody'
totals, rates = tree.findall(xpath)
Extract the raw data (td.find('./') means first child item, whatever tag it has) and clean the strings (r'' raw strings are needed for Python 2.x):
raw_data = []
for tbody in totals, rates:
    rows = []
    for tr in tbody.getchildren():
        row = []
        for td in tr.getchildren():
            child = td.find('./')
            if child is not None and child.tag != 'br':
                row.append(child.text.strip(r'\xa0').strip(r'\n').strip())
            else:
                row.append('')
        rows.append(row)
    raw_data.append(rows)
Zip together the table headers in the first two rows, then delete the redundant rows, using steps of 12 and 11 in the slice notation:
data = {}
data['tags'] = [tag0 + tag1 for tag0, tag1 in zip(raw_data[0][0], raw_data[0][1])]
for raw in raw_data:
    del raw[::12]
    del raw[::11]
Store the rest of the raw data and create a JSON file (optional: eliminate whitespace with separators=(',', ':')):
data['totals'], data['rates'] = raw_data[0], raw_data[1]
with open('data.json', 'w') as f:
    json.dump(data, f, separators=(',', ':'))

lxml encoding changes while appending string to list

I'm using lxml and etree to parse an HTML file. The code looks like this:
def get_all_languages():
    allLanguages = "http://wold.livingsources.org/vocabulary"
    f = urllib.urlopen(allLanguages).read()
    #inFile = "imLog.xml"
    #html = f.read()
    #f.close()
    #encoding = chardet.detect(f)['encoding']
    #f.decode(encoding, 'replace').encode('utf-8')
    html = etree.HTML(f)
    #result = etree.tostring(html, pretty_print=True, method="html")
    #print result #is of type string
    #print type(result)
    return html
Then I'm extracting some information from the website and saving it in an array.
While appending the string to the array, the encoding or format of the string changes. I think it becomes some kind of unicode object??
I thought that would not be a problem, because I take it out of the array and print it to an output file.
def print_file():
    #outputfile
    output = 'WOLDDictionaries.txt'
    dataLanguage = get_data_all_languages()
    dataDictionaries = get_site_single_lang()
    outputFile = open(output, "w")
    outputFile.flush()
    for index, array in enumerate(dataLanguage):
        indexLang = index
        for item in array:
            string = item
            #indexLang = index
            outputFile.write(string + "\t")
        outputFile.write("\n")
        #outputFile.flush()
        for index, array in enumerate(dataDictionaries):
            #stringArray = str(array)
            indexDic = index
            #outputFile.write(index + stringArray + "\t")
            if(indexLang == indexDic):
                #outputFile.write(string + "\t")
                for data in array:
                    #stringData = str(data)
                    #outputFile.write(stringData + "\t")
                    for word in data:
                        stringWord = word
                        outputFile.write(stringWord + "\t")
                    outputFile.write("\n")
                    #outputFile.flush()
    outputFile.close()
Well, this thought was wrong. When printing to the file, the encoding is still wrong.
What can I do to get the right characters?
OK, I found the problem. It was in the parser.
I changed the encoding in the parser and got the right encoding in the end.
The method now looks like this:
def get_all_languages():
    ...
    myparser = etree.HTMLParser(encoding='utf-8')
    html = etree.HTML(f, parser=myparser)
    #print etree.tostring(xml, pretty_print=True, method="xml")
    #print type(xml)
    return html
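For completeness, the writing side matters as well: a sketch (my addition, assuming Python 2 as in the question) that opens the output file through the io module, so unicode strings are encoded as UTF-8 on write instead of hitting the default ASCII codec:
# -*- coding: utf-8 -*-
import io

# io.open returns a file object that encodes unicode transparently (Python 2)
with io.open('WOLDDictionaries.txt', 'w', encoding='utf-8') as outputFile:
    outputFile.write(u'some unicode text\t')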
