I'm looping over a csv of links, visiting those links, and then trying to write information from those links to a new file:
with open("hrefs.csv", "rb") as f:
reader = csv.reader(f)
for row in reader:
newUrl = row[0]
response = requests.get(newUrl)
newData = response.text
newSoup = BeautifulSoup(newData, 'lxml')
newstring = ''
titles = newSoup.findAll('span', {'id': 'titletextonly'})
prices = newSoup.findAll('span', {'class': 'price'})
newstring += titles[0].text + ',' + prices[0].text + ','
for ana in newSoup.findAll('p',{'class':'attrgroup'}):
for myb in ana.findAll('b'):
newstring += myb.text + ','
print newstring
listFile = open("output.csv", 'wb')
writer = csv.writer(listFile)
writer.writerow(newstring.encode('ascii', 'ignore').decode('ascii'))
There are a couple of problems I'm running into. First, I thought the csv module would realize that there are comma-separated values and put each attribute in a new column. Second, it seems that one letter is getting put in each column. When I simply print each newstring it gives me a coherent string.
You need to give writer.writerow a sequence of strings:
writer.writerow(newstring.split(","))
would be the easiest change from what you currently have.
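More generally, if you build the row as a list instead of one comma-joined string, the csv module quotes any embedded commas for you, and the output file should be opened once rather than on every loop iteration. A minimal Python 3 sketch along those lines (selectors taken from the question; not tested against the live pages):

import csv
import requests
from bs4 import BeautifulSoup

with open("hrefs.csv") as f, open("output.csv", "w", newline='') as out:
    reader = csv.reader(f)
    writer = csv.writer(out)  # open the writer once, outside the loop
    for row in reader:
        soup = BeautifulSoup(requests.get(row[0]).text, 'lxml')
        # one field per list element; writerow handles the quoting
        fields = [soup.find('span', {'id': 'titletextonly'}).text,
                  soup.find('span', {'class': 'price'}).text]
        for ana in soup.findAll('p', {'class': 'attrgroup'}):
            for myb in ana.findAll('b'):
                fields.append(myb.text)
        writer.writerow(fields)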
I'm trying to process data (remove hashtags, links and '@' mentions) from CSV files and store it back to CSV. However, the output does not come out well: it is separated with a comma after each character. Can anyone help me write it to csv in a better way? Thank you.
import re, string
import csv

def strip_links(text):
    link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')
    return text

def strip_all_entities(text):
    entity_prefixes = ['@', '#']
    for separator in string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator, ' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

f = open('Test.csv')
csf_f = csv.reader(f)
temp = []
for row in csf_f:
    temp.append(row[0])
temp1 = []
for t in temp:
    temp1.append(strip_all_entities(strip_links(t)))
for i in range(0, len(temp1)):
    with open('MYOUTPUT.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(temp1)
f.close()
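The per-character commas have the same cause as in the first question above: writer.writerows() iterates over each string, and iterating a string yields one character per column. Opening the output file inside the loop also rewrites it from scratch on every pass. A minimal sketch of the fix, assuming each cleaned line should become a single-cell row:

# write once, after processing; wrap each string in a list so it
# lands in a single cell instead of one character per column
with open('MYOUTPUT.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerows([line] for line in temp1)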
You may think of this as yet another redundant question, but I tried to go through all the similar questions and had no luck so far. In my specific use-case, I can't use pandas or any other similar library for this operation.
This is what my input looks like:
AttributeName,Value
Name,John
Gender,M
PlaceofBirth,Texas
Name,Alexa
Gender,F
SurName,Garden
This is my expected output:
Name,Gender,Surname,PlaceofBirth
John,M,,Texas
Alexa,F,Garden,
So far, I have tried to store my input in a dictionary and then write it to a csv string. But it is failing, as I am not sure how to handle the case of missing column values. Here is my code so far:
reader = csv.reader(csvstring.split('\n'), delimiter=',')
csvdata = {}
csvfile = ''
for row in reader:
    if row[0] != '' and row[0] in csvdata and row[1] != '':
        csvdata[row[0]].append(row[1])
    elif row[0] != '' and row[0] in csvdata and row[1] == '':
        csvdata[row[0]].append(' ')
    elif row[0] != '' and row[1] != '':
        csvdata[row[0]] = [row[1]]
    elif row[0] != '' and row[1] == '':
        csvdata[row[0]] = [' ']
for key, value in csvdata.items():
    if value == ' ':
        csvdata[key] = []
csvfile += ','.join(csvdata.keys()) + '\n'
for row in zip(*csvdata.values()):
    csvfile += ','.join(row) + '\n'
For the above code as well, I took some help here. Thanks in advance for any suggestions/advice.
Edit #1: Updated the code to clarify that I am processing a csv string instead of a csv file.
What you need is something like this:
import csv

with open("in.csv") as infile:
    buffer = []
    item = {}
    lines = csv.reader(infile)
    for line in lines:
        if line[0] == 'Name':
            buffer.append(item.copy())
            item = {'Name': line[1]}
        else:
            item[line[0]] = line[1]
    buffer.append(item.copy())

for item in buffer[1:]:
    print(item)
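To turn that list of dicts into the CSV from the expected output, a csv.DictWriter sketch could follow (assuming the SurName/Surname typo in the input is normalized first; restval supplies the blank for missing attributes):

cols = ["Name", "Gender", "Surname", "PlaceofBirth"]
with open("out.csv", "w", newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=cols, restval='')
    writer.writeheader()
    writer.writerows(buffer[1:])  # restval fills in each missing attribute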
If none of the attributes is mandatory, I think @framontb's solution needs to be rearranged in order to also work when the Name field is not given.
This is an import-free solution, and it's not super elegant.
I assume you already have your lines in this form, with these columns:
lines = [
    "Name,John",
    "Gender,M",
    "PlaceofBirth,Texas",
    "Gender,F",
    "Name,Alexa",
    "Surname,Garden"  # modified typo here: SurName -> Surname
]
cols = ["Name", "Gender", "Surname", "PlaceofBirth"]
We need to distinguish one record from another, and without mandatory fields the best I can do is start considering a new record when an attribute has already been seen.
To do this, I use a temporary list of attributes, tempcols, from which I remove elements until an error is raised, i.e., a new record has begun.
Code:
csvdata = {k: [] for k in cols}
tempcols = list(cols)
for line in lines:
    attr, value = line.split(",")
    try:
        csvdata[attr].append(value)
        tempcols.remove(attr)
    except ValueError:
        for c in tempcols:  # now tempcols has only "missing" attributes
            csvdata[c].append("")
        tempcols = [c for c in cols if c != attr]
for c in tempcols:
    csvdata[c].append("")
# write csv string with the code you provided
csvfile = ""
csvfile += ",".join(csvdata.keys()) + "\n"
for row in zip(*csvdata.values()):
    csvfile += ",".join(row) + "\n"
>>> print(csvfile)
Name,PlaceofBirth,Surname,Gender
John,Texas,,M
Alexa,,Garden,F
If instead you want to order the columns to match your desired output:
csvfile = ""
csvfile += ",".join(cols) + "\n"
for row in zip(*[csvdata[k] for k in cols]):
    csvfile += ",".join(row) + "\n"
>>> print(csvfile)
Name,Gender,Surname,PlaceofBirth
John,M,,Texas
Alexa,F,Garden,
This works for me:
with open("in.csv") as infile, open("out.csv", "w") as outfile:
incsv, outcsv = csv.reader(infile), csv.writer(outfile)
incsv.__next__() # Skip 1st row
outcsv.writerows(zip(*incsv))
Update: For input and output as strings:
import csv, io

with io.StringIO(indata) as infile, io.StringIO() as outfile:
    incsv, outcsv = csv.reader(infile), csv.writer(outfile)
    incsv.__next__()  # Skip 1st row
    outcsv.writerows(zip(*incsv))
    print(outfile.getvalue())
I am trying to extract a review from one page on Zomato using requests and Beautiful Soup 4 in Python. I want to store the link of the requested page and the extracted review in one csv file.
My problem is that the review I extract is not stored in one cell but instead splits across multiple cells. How do I store my extracted review in one cell?
Here is my code:
import time
from bs4 import BeautifulSoup
import requests
URL = "https://www.zomato.com/review/eQEygl"
time.sleep(2)
reviewPage = requests.get(URL, headers = {'user-agent': 'my-app/0.0.1'})
reviewSoup = BeautifulSoup(reviewPage.content,"html.parser")
reviewText = reviewSoup.find("div",{"class":"rev-text"})
textSoup = BeautifulSoup(str(reviewText), "html.parser")
reviewElem = [URL, ""]
for string in textSoup.stripped_strings:
    reviewElem[1] += string
csv = open("out.csv", "w", encoding="utf-8")
csv.write("Link, Review\n")
row = reviewElem[0] + "," + reviewElem[1] + "\n"
csv.write(row)
csv.close()
Output and expected output (shown as screenshots in the original post)
I think the problem is the commas embedded in the reviewElem[1] string, because they are the default delimiter in most CSV software. The following avoids the problem by wrapping the contents of the string in " characters to indicate it's all one cell:
import time
from bs4 import BeautifulSoup
import requests
URL = "https://www.zomato.com/review/eQEygl"
time.sleep(2)
reviewPage = requests.get(URL, headers = {'user-agent': 'my-app/0.0.1'})
reviewSoup = BeautifulSoup(reviewPage.content,"html.parser")
reviewText = reviewSoup.find("div",{"class":"rev-text"})
textSoup = BeautifulSoup(str(reviewText), "html.parser")
reviewElem = [URL, ""]
for string in textSoup.stripped_strings:
    reviewElem[1] += string
csv = open("out.csv", "w", encoding="utf-8")
csv.write("Link, Review\n")
#row = reviewElem[0] + "," + reviewElem[1] + "\n"
row = reviewElem[0] + ',"{}"\n'.format(reviewElem[1]) # quote string 2
csv.write(row)
csv.close()
There is no need to manually construct a CSV string. When you do it manually and there are column delimiters (comma by default) inside the column values, they are interpreted as delimiters rather than literal characters, and a single column value ends up scattered across multiple columns.
Use the csv module and the .writerow() method:
import csv
# ...
with open("out.csv", "w", encoding="utf-8") as csv_file:
writer = csv.writer(csv_file)
writer.writerow(["Link", "Review"])
writer.writerow(reviewElem)
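For illustration, csv.writer quotes any field that contains the delimiter automatically; a small self-contained check (the strings are made up, just to show the behavior):

import csv, io

buf = io.StringIO()
csv.writer(buf).writerow(["http://example.com", "Great food, great place"])
print(buf.getvalue())  # http://example.com,"Great food, great place"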
I'm writing a script where one of its functions is to read a CSV file that contains URLs in one of its columns. Unfortunately the system that creates those CSVs doesn't put double quotes around values in the URL column, so when a URL contains commas it breaks all my csv parsing.
This is the code I'm using:
with open(accesslog, 'r') as csvfile, open('results.csv', 'w') as enhancedcsv:
    reader = csv.DictReader(csvfile)
    for row in reader:
        self.uri = (row['URL'])
        self.OriCat = (row['Category'])
        self.query(self.uri)
        print self.URL + "," + self.ServerIP + "," + self.OriCat + "," + self.NewCat
This is a sample URL that is breaking the parsing - this URL comes in the column named "URL". (Note the commas at the end.)
ams1-ib.adnxs.com/ww=1238&wh=705&ft=2&sv=43&tv=view5-1&ua=chrome&pl=mac&x=1468251839064740641,439999,v,mac,webkit_chrome,view5-1,0,,2,
The field that comes right after the URL always holds a numeric value between parentheses, e.g. (9999), so this could be used to determine where a URL with commas ends.
How can I deal with a situation like this using the csv module?
You will have to do it a little more manually. Try this:
def process(lines, delimiter=','):
    header = None
    url_index_from_start = None
    url_index_from_end = None
    for line in lines:
        if not header:
            header = [l.strip() for l in line.split(delimiter)]
            url_index_from_start = header.index('URL')
            url_index_from_end = len(header) - url_index_from_start
        else:
            data = [l.strip() for l in line.split(delimiter)]
            url_from_start = url_index_from_start
            url_from_end = len(data) - url_index_from_end
            values = data[:url_from_start] + data[url_from_end+1:] + [delimiter.join(data[url_from_start:url_from_end+1])]
            keys = header[:url_index_from_start] + header[url_index_from_end+1:] + [header[url_index_from_start]]
            yield dict(zip(keys, values))
Usage:
lines = ['Header1, Header2, URL, Header3',
         'Content1, "Content2", abc,abc,,abc, Content3']
result = list(process(lines))
assert result[0]['Header1'] == 'Content1'
assert result[0]['Header2'] == '"Content2"'
assert result[0]['Header3'] == 'Content3'
assert result[0]['URL'] == 'abc,abc,,abc'
print(result)
Result:
>>> [{'URL': 'abc,abc,,abc', 'Header2': '"Content2"', 'Header3': 'Content3', 'Header1': 'Content1'}]
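If you'd rather exploit the parenthesized-number hint from the question, a sketch along these lines could also work (marker, fix_row and url_idx are hypothetical names; it assumes the field right after the URL always looks like (9999)):

import re

marker = re.compile(r'^\(\d+\)$')  # the numeric field that ends the URL, e.g. (9999)

def fix_row(fields, url_idx):
    # re-join the unquoted URL pieces from url_idx up to the (9999) marker
    end = url_idx
    while end < len(fields) and not marker.match(fields[end].strip()):
        end += 1
    return fields[:url_idx] + [','.join(fields[url_idx:end])] + fields[end:]

You would split each raw line on commas, pass the pieces through fix_row, and only then zip them with the header.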
Have you considered using Pandas to read your data in?
Another possible solution would be to use regular expressions to pre-process the data...
import re

# read the file first
with open(filein, 'r') as f:
    filedata = f.read()

# make a list of everything you want to change
# ('regex' is assumed to be a pattern that matches your URLs)
old = re.findall(regex, filedata)

# append quotes and create a new list
new = []
for url in old:
    url2 = "\"" + url + "\""
    new.append(url2)

# combine the lists, then use them to update the file
# (replace on the accumulated text so every URL is quoted, not just the last)
for o, n in zip(old, new):
    filedata = filedata.replace(o, n)

with open(filein, 'w') as f:
    f.write(filedata)
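A more compact variant of the same pre-processing idea (same assumption that regex matches the URL field) uses re.sub with a callable, which avoids the intermediate lists and replaces every match in one pass:

import re

with open(filein, 'r') as f:
    filedata = f.read()

# wrap every match of the (assumed) URL pattern in double quotes
quoted = re.sub(regex, lambda m: '"' + m.group(0) + '"', filedata)

with open(filein, 'w') as f:
    f.write(quoted)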
I'm using lxml and etree to parse an HTML file. The code looks like this:
def get_all_languages():
    allLanguages = "http://wold.livingsources.org/vocabulary"
    f = urllib.urlopen(allLanguages).read()
    #inFile = "imLog.xml"
    #html = f.read()
    #f.close()
    #encoding = chardet.detect(f)['encoding']
    #f.decode(encoding, 'replace').encode('utf-8')
    html = etree.HTML(f)
    #result = etree.tostring(html, pretty_print=True, method="html")
    #print result #is of type string
    #print type(result)
    return html
Then I'm extracting some information from the website and saving it in an array.
While appending the string to the array, the encoding or format of the string changes. I think it becomes some kind of unicode object?
I thought this wouldn't be a problem, since I just take it out of the array again and print it to an output file.
def print_file():
    # output file
    output = 'WOLDDictionaries.txt'
    dataLanguage = get_data_all_languages()
    dataDictionaries = get_site_single_lang()
    outputFile = open(output, "w")
    outputFile.flush()
    for index, array in enumerate(dataLanguage):
        indexLang = index
        for item in array:
            string = item
            #indexLang = index
            outputFile.write(string + "\t")
        outputFile.write("\n")
        #outputFile.flush()
        for index, array in enumerate(dataDictionaries):
            #stringArray = str(array)
            indexDic = index
            #outputFile.write(index + stringArray + "\t")
            if (indexLang == indexDic):
                #outputFile.write(string + "\t")
                for data in array:
                    #stringData = str(data)
                    #outputFile.write(stringData + "\t")
                    for word in data:
                        stringWord = word
                        outputFile.write(stringWord + "\t")
                    outputFile.write("\n")
                    #outputFile.flush()
    outputFile.close()
Well, this thought was wrong: when printing to a file, the encoding is still wrong.
What can I do to get the right characters?
OK, I found the problem. It was in the parser.
I changed the encoding in the parser and got the right encoding in the end.
The method now looks like this:
myparser = etree.HTMLParser(encoding = 'utf-8')
html = etree.HTML(f, parser=myparser)
#print etree.tostring(xml, pretty_print=True, method="xml")
#print type(xml)
return html