I have the text below, stored as a Python string (`<class 'str'>`):
"pois":[{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"附近","distance":"42","name":"金融街新融御","poiType":"房地产","point":{"x":106.56036820713899,"y":29.577630842401097},"tag":"房地产;住宅区","tel":"","uid":"71785bc6fdf697313fe2315e","zip":"","parent_poi":{"name":"","tag":"","addr":"","point":{"x":0.0,"y":0.0},"direction":"","distance":"","uid":""}},{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"东北","distance":"123","name":"乐融幼儿园","poiType":"教育培训","point":{"x":106.55956871523539,"y":29.57748164058615},"tag":"教育培训;幼儿园","tel":"","uid":"ae5c28352429035ed3ff2381","zip":"","parent_poi":{"name":"","tag":"","addr":"","point":{"x":0.0,"y":0.0},"direction":"","distance":"","uid":""}},{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"东","distance":"58","name":"龙湖春森彼岸2-5栋","poiType":"房地产","point":{"x":106.56003583410042,"y":29.577929245362655},"tag":"房地产;内部楼栋","tel":"","uid":"2982a6e5b02290c58f1a89cb","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"重庆市江北区长新路与刘家台正街交叉路口往西北约160米","cp":" ","direction":"东","distance":"108","name":"龙湖·春森彼岸-8号车库","poiType":"交通设施","point":{"x":106.55962261356597,"y":29.578141267978038},"tag":"交通设施;停车场","tel":"","uid":"63b440710aa3128fadee7a4a","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"重庆市江北区长兴路2号","cp":" ","direction":"东","distance":"113","name":"龙湖·春森彼岸-北门","poiType":"出入口","point":{"x":106.55957769829048,"y":29.578149120658855},"tag":"出入口;门","tel":"","uid":"4eabe73ddd66fc9a3a13fd89","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"北滨路陈家馆A宗E宗","cp":" 
","direction":"东","distance":"154","name":"龙湖春森彼岸2-8栋","poiType":"房地产","point":{"x":106.55917346081113,"y":29.577944950757006},"tag":"房地产;内部楼栋","tel":"","uid":"1e3f326e817ccb9fc909f036","zip":"","parent_poi":{"name":"龙湖·春森彼岸","tag":"房地产;住宅区","addr":"重庆市江北区北滨一路258号","point":{"x":106.55829312141164,"y":29.576979064412556},"direction":"东北","distance":"279","uid":"9de85c83b09b837888f15d98"}},{"addr":"重庆市江北区长新路与刘家台正街交叉路口往西约120米","cp":" ","direction":"东北","distance":"197","name":"龙湖星悦荟8区(龙湖星悦荟店)-停车库","poiType":"交通设施","point":{"x":106.55942498635383,"y":29.57672777537788},"tag":"交通设施;停车场","tel":"","uid":"868395aeca61ca754251c078","zip":"","parent_poi":{"name":"龙湖星悦荟8区(龙湖星悦荟店)","tag":"购物;购物中心".....
I want to extract :
"tag":"房地产;住宅"
"tag":"教育培训;幼儿园"
"tag":"房地产;内部楼栋"
"tag":"交通设施;停车场"
"tag":"购物;购物中心"
I have tried :
# Substring-slicing attempt.  NOTE(review): str.find only returns the
# FIRST occurrence, so this extracts a single tag instead of iterating
# over every `"tag":...` pair in the text — that is why it "couldn't
# iterate over the whole text".
startString = '"tag":'
endString = ',"tel"'  # dropped the stray trailing '' from the original
mySubString.append(result[result.find(startString) + len(startString):result.find(endString)])
but couldn't iterate over the whole text.
I hope this works for you — I used regex to extract these tags:
import re

# The exact literal substrings we want to locate in the text dump.
extract_str = [
    '"tag":"房地产;住宅"', '"tag":"教育培训;幼儿园"', '"tag":"房地产;内部楼栋"',
    '"tag":"交通设施;停车场"', '"tag":"购物;购物中心"'
]

with open('./extract_specific_from_text.txt', 'r', encoding="utf8") as f:
    file_txt = f.read()

for pattern in extract_str:
    # re.escape guards the literal against regex metacharacters.  Every
    # match of a literal pattern is the pattern itself, so the original
    # `list(set(tags))[0]` indirection was redundant — printing the
    # pattern directly produces identical output.
    if re.findall(re.escape(pattern), file_txt):
        print(f'{pattern} found in the file')
    else:
        print(f'{pattern} not found in the file')
You can try something like:
import json

# Parse the raw JSON string, then collect the "tag" field of every POI entry.
data = json.loads(data)
print([poi['tag'] for poi in data["pois"]])
Which will print all the tag in the json.
# Output
['房地产;住宅区', '教育培训;幼儿园', '房地产;内部楼栋', '交通设施;停车场', '出入口;门', '房地产;内部楼栋', '交通设施;停车场']
I would like to load data consisting of 10 categories of documents, where each category contains text files, but I keep getting the following error:
IndexError: list index out of range
This is the code:
def load_data(folder):
    """Read every file in *folder* and return fastText-style lines.

    Each returned entry has the form ``"__label__<category> <content>"``,
    where the category label is the name of the directory that contains
    the file (spaces replaced by underscores).

    Fix: the original used ``file.split("/")[9]``, which raises
    IndexError whenever the path has fewer than 10 components and is not
    portable across OS path separators; the parent-directory name is
    taken with os.path instead.
    """
    data = []
    files = [os.path.join(folder, x) for x in os.listdir(folder)]
    for file in files:
        # Category = name of the file's parent directory, at any depth.
        topic = os.path.basename(os.path.dirname(file))
        label = topic.replace(" ", "_")
        name = "__label__" + label
        with open(file, "rb") as f:
            content = f.read()
        # Files are UTF-16 encoded; collapse all runs of whitespace
        # (including newlines) to single spaces.
        content = content.decode('utf-16')
        content = " ".join(content.split())
        data.append(name + " " + content)
    return data
An easy way to debug this would be to add print statements and check what the objects hold. For example, in this case you can add two print statements at the beginning of the for loop. This would help you figure out why you are getting the IndexError.
def load_data(folder):
    """Debugging variant of load_data: prints each file path and its
    split parts before indexing, so you can see exactly why
    ``split("/")[9]`` goes out of range for shallow paths."""
    data = []
    for file in [join(folder, x) for x in os.listdir(folder)]:
        print(file)
        print(file.split("/"))
        topic = file.split("/")[9]  # this is where the error occurs
        label = topic.replace(" ", "_")
        name = "__label__" + label
        with open(file, "rb") as f:
            raw = f.read()
        # Decode the UTF-16 bytes and normalise whitespace to single spaces.
        text = " ".join(raw.decode('utf-16').split())
        data.append(name + " " + text)
    return data
I'm looping over a csv of links, visiting those links, and then trying to write information from those links to a new file:
# NOTE(review): Python 2 code (print statement, "rb" mode for csv).
# Visits each URL listed in hrefs.csv and collects title/price/attributes.
with open("hrefs.csv", "rb") as f:
    reader = csv.reader(f)
    for row in reader:
        # First column of each row is the URL to scrape.
        newUrl = row[0]
        response = requests.get(newUrl)
        newData = response.text
        newSoup = BeautifulSoup(newData, 'lxml')
        # Accumulate the scraped fields as one comma-joined string.
        newstring = ''
        titles = newSoup.findAll('span', {'id': 'titletextonly'})
        prices = newSoup.findAll('span', {'class': 'price'})
        newstring += titles[0].text + ',' + prices[0].text + ','
        for ana in newSoup.findAll('p',{'class':'attrgroup'}):
            for myb in ana.findAll('b'):
                newstring += myb.text + ','
        print newstring
        # NOTE(review): two problems here — csv.writerow expects a
        # SEQUENCE, so passing one string writes one character per
        # column; and reopening output.csv with 'wb' truncates it on
        # every row, keeping only the last row's data.
        listFile = open("output.csv", 'wb')
        writer = csv.writer(listFile)
        writer.writerow(newstring.encode('ascii', 'ignore').decode('ascii'))
There are a couple of problems I'm running into. First, I thought the csv module would recognize the comma-separated values and put each attribute in a new column. Second, it seems that one letter is being put in each column. When I simply print each newstring it gives me a coherent string.
You need to give writer.writerow a sequence of strings:
writer.writerow(newstring.split(","))
would be the easiest change from what you currently have.
I'm writing a script where one of its functions is to read a CSV file that contains URLs in one of its columns. Unfortunately the system that creates those CSVs doesn't put double quotes around values inside the URL column, so when the URL contains commas it breaks all my CSV parsing.
This is the code I'm using:
# NOTE(review): fragment of a method — `self`, `accesslog` and
# `self.query` are defined elsewhere in the caller's class.
with open(accesslog, 'r') as csvfile, open('results.csv', 'w') as enhancedcsv:
    reader = csv.DictReader(csvfile)
    for row in reader:
        self.uri = (row['URL'])
        self.OriCat = (row['Category'])
        # query() presumably fills in self.ServerIP / self.NewCat — confirm.
        self.query(self.uri)
        # Fixed: the original line ended with a stray '"' (a syntax
        # error) and used the Python-2-only print statement.
        print(self.URL + "," + self.ServerIP + "," + self.OriCat + "," + self.NewCat)
This is a sample URL that is breaking the parsing — it appears in the column named "URL" (note the commas at the end):
ams1-ib.adnxs.com/ww=1238&wh=705&ft=2&sv=43&tv=view5-1&ua=chrome&pl=mac&x=1468251839064740641,439999,v,mac,webkit_chrome,view5-1,0,,2,
The field following the URL always comes with a numeric value between parentheses, e.g. (9999), so this could be used to determine where the comma-containing URL ends.
How can i deal with a situation like this using the csv module?
You will have to do it a little more manually. Try this
def process(lines, delimiter=','):
    """Parse CSV-like lines whose URL column may contain unquoted delimiters.

    The first line is the header. Because only the URL column can hold
    extra delimiters, the fields before it are counted from the start of
    the row and the fields after it from the end; whatever remains in
    the middle is re-joined with *delimiter* as the URL value.

    Yields one dict per data line, keyed by the (stripped) header names.
    """
    header = None
    url_index_from_start = None
    url_index_from_end = None
    for line in lines:
        if not header:
            header = [l.strip() for l in line.split(delimiter)]
            url_index_from_start = header.index('URL')
            # Number of columns from URL (inclusive) to the end of the row.
            url_index_from_end = len(header) - url_index_from_start
        else:
            data = [l.strip() for l in line.split(delimiter)]
            url_from_start = url_index_from_start
            # Index of the last piece belonging to the URL field.
            url_from_end = len(data) - url_index_from_end
            values = (data[:url_from_start]
                      + data[url_from_end + 1:]
                      + [delimiter.join(data[url_from_start:url_from_end + 1])])
            # BUG FIX: the non-URL header names must be sliced around
            # url_index_from_start (the URL's position in the header).
            # The original sliced the header with url_index_from_end,
            # which only produced correct keys when the two indices
            # happened to coincide (as in the four-column example).
            keys = (header[:url_index_from_start]
                    + header[url_index_from_start + 1:]
                    + [header[url_index_from_start]])
            yield dict(zip(keys, values))
Usage:
# Demo input: the URL value "abc,abc,,abc" contains unquoted delimiters.
lines = ['Header1, Header2, URL, Header3',
         'Content1, "Content2", abc,abc,,abc, Content3']
result = list(process(lines))
# Sanity checks: ordinary columns survive, and the URL is reassembled intact.
assert result[0]['Header1'] == 'Content1'
assert result[0]['Header2'] == '"Content2"'
assert result[0]['Header3'] == 'Content3'
assert result[0]['URL'] == 'abc,abc,,abc'
print(result)
Result:
>>> [{'URL': 'abc,abc,,abc', 'Header2': '"Content2"', 'Header3': 'Content3', 'Header1': 'Content1'}]
Have you considered using Pandas to read your data in?
Another possible solution would be to use regular expressions to pre-process the data...
# Make a list of everything you want to change.
# NOTE(review): `regex` and `filein` are placeholders — supply the URL
# pattern and the path of the file being pre-processed.
with open(filein, 'r') as f:
    filedata = f.read()
old = re.findall(regex, filedata)

# Append quotes to create the replacement values.
new = ['"' + url + '"' for url in old]

# Combine the lists.
old_new = list(zip(old, new))

# Apply every replacement to the SAME buffer, then write the file once.
# BUG FIX: the original computed `newdata = filedata.replace(old, new)`
# inside the loop, restarting from the untouched filedata on every
# pass, so only the LAST pair's replacement survived in the written
# file; it also reopened and rewrote the file on every iteration and
# never closed the handles.
for old_url, new_url in old_new:
    filedata = filedata.replace(old_url, new_url)

with open(filein, 'w') as f:
    f.write(filedata)
I have a file that consists of a single line:
a,x,b,c,d,e
I want to convert this into
a,x,b,x,c,x,d,x,e,x
Is there any easy way to achieve this with python?
# Read the comma-separated fields and emit each non-'x' field followed
# by ',x'.  Fixes from the original: the file handle is closed via
# `with`, the builtin name `str` is no longer shadowed, the pieces are
# assembled without quadratic `+=` concatenation, and the Python-2-only
# print statement is replaced by the print() function.
with open(filename) as my_file:
    fields = my_file.read().split(',')

# Trailing strip(',') removes the final separator, as in the original.
result = ''.join(field + ',x,' for field in fields if field != 'x').strip(',')
print(result)
import re

# Same conversion via regex: keep every single-letter field except 'x'
# ([a-wyz] = lowercase letters minus x) and rewrite the file with ',x'
# inserted after each one.  Fix: `with` blocks close both handles — the
# original leaked two open file objects.  NOTE(review): the regex only
# handles single-character fields, matching the question's sample input.
with open(filename) as src:
    s = src.read()
with open(filename, 'w') as dst:
    dst.write(',x,'.join(re.findall(r'[a-wyz]', s)) + ',x\n')