Writing to CSV - set vs list - UnicodeEncodeError - python

I'm building a simple scraper in order to learn python.
After writing the csvWriter function below, I'm having issues. It seems the scraped text can't be encoded when it is written to the csv file (I assume this is because of the price information I'm scraping).
Also, I'm wondering if I am correct in thinking that in this case, it is best to go from set -> list to get the information zipped and presented in the way that I want before writing.
Also - any general advice on how I am approaching this?
from bs4 import BeautifulSoup
import requests
import time
import csv

response = requests.get('http://website.com/subdomain/logqueryhere')
baseurl = 'http://website.com'
soup = BeautifulSoup(response.text)
hotelInfo = soup.find_all("div", {'class': "hotel-wrap"})

# retrieveLinks: A function to generate a list of hotel URLs to be passed to the price checker.
def retrieveLinks():
    for hotel in hotelInfo:
        urllist = []
        hotelLink = hotel.find('a', attrs={'class': ''})
        urllist.append(hotelLink['href'])
        scraper(urllist)

hotelnameset = set()
hotelurlset = set()
hotelpriceset = set()

# scraper: A function to scrape from the lists generated above with retrieveLinks.
def scraper(inputlist):
    global hotelnameset
    global hotelurlset
    global hotelpriceset
    # Use sets here to avoid any dupes.
    for url in inputlist:
        fullurl = baseurl + url
        hotelurlset.add(str(fullurl))
        hotelresponse = requests.get(fullurl)
        hotelsoup = BeautifulSoup(hotelresponse.text)
        hoteltitle = hotelsoup.find('div', attrs={'class': 'vcard'})
        hotelhighprice = hotelsoup.find('div', attrs={'class': 'pricing'}).text
        hotelpriceset.add(hotelhighprice)
        for H1 in hoteltitle:
            hotelName = hoteltitle.find('h1').text
            hotelnameset.add(str(hotelName))
        time.sleep(2)
    csvWriter()

# csvWriter: A function to write the above-mentioned sets/lists to a CSV file.
def csvWriter():
    global hotelnameset
    global hotelurlset
    global hotelpriceset
    csvname = list(hotelnameset)
    csvurl = list(hotelurlset)
    csvprice = list(hotelpriceset)
    # Let's zip the values we need (until we learn a better way to do it).
    zipped = zip(csvname, csvurl, csvprice)
    c = csv.writer(open("hoteldata.csv", 'wb'))
    for row in zipped:
        c.writerow(row)

retrieveLinks()
Error is as follows -
± |Add_CSV_Writer U:2 ✗| → python main.py
Traceback (most recent call last):
  File "main.py", line 62, in <module>
    retrieveLinks()
  File "main.py", line 18, in retrieveLinks
    scraper(urllist)
  File "main.py", line 44, in scraper
    csvWriter()
  File "main.py", line 60, in csvWriter
    c.writerow(row)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u20ac' in position 0: ordinal not in range(128)

Posting your actual error will really help! In any case, in Python 2.x the csv writer does not automatically encode unicode for you. You essentially have to write your own encoding layer, use unicodecsv (https://pypi.python.org/pypi/unicodecsv/0.9.0), or use one of the unicode CSV implementations available on the web:
import unicodecsv

def csvWriter():
    global hotelnameset
    global hotelurlset
    global hotelpriceset
    csvname = list(hotelnameset)
    csvurl = list(hotelurlset)
    csvprice = list(hotelpriceset)
    # Let's zip the values we need (until we learn a better way to do it).
    zipped = zip(csvname, csvurl, csvprice)
    with open('hoteldata.csv', 'wb') as f_in:
        c = unicodecsv.writer(f_in, encoding='utf-8')
        for row in zipped:
            c.writerow(row)
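For what it's worth, the problem disappears entirely on Python 3, where the built-in csv module writes text rather than bytes. A minimal sketch, assuming the same zipped data as above:

import csv

# Python 3: open the file in text mode with an explicit encoding.
# newline='' is what the csv module documentation recommends for csv files.
with open('hoteldata.csv', 'w', encoding='utf-8', newline='') as f_out:
    c = csv.writer(f_out)
    for row in zipped:
        c.writerow(row)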

Related

Can't write to csv file

When I try to write the information to the csv file, this error is thrown:
Traceback (most recent call last):
  File "sizeer.py", line 68, in <module>
    writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup

proxies = {
    "http":"http://195.189.60.97:3128",
    "http":"http://103.78.75.165:8080",
    "http":"http://212.87.220.2:3128",
    "http":"http://88.99.134.61:8080",
    "http":"http://103.102.139.178:8080",
    "http":"http://218.60.8.83:3129",
    "http":"http://124.121.105.193:8888",
    "http":"http://198.237.114.54:8080",
    "http":"http://36.67.106.58:8080",
    "http":"http://35.214.241.28:3128"
}
base_url = ...
page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
    exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')

with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])

# Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
    anchor = cw.find("a", recursive=False)
    categories.append(anchor['href'])

# Iterate categories
for category in categories:
    cat_page = requests.get(base_url + category, proxies=proxies)
    cat_soup = BeautifulSoup(cat_page.content, 'lxml')
    products_wrapper = cat_soup.find(class_="b-productList")
    cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
    max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]
    # Iterate the category with pagination and get products
    for i in range(1, max_page+1):
        cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
        cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
        product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")
        for link in product_links:
            # Get product data
            product_page = requests.get(base_url+link['href'], proxies=proxies)
            product_soup = BeautifulSoup(product_page.content, 'lxml')
            # Get product variations
            variations = product_soup.find_all(class_="m-productDescr_colorItem")
            # If there are variations
            if len(variations) > 0:
                for v in variations:
                    variation_page = requests.get(base_url+v['href'], proxies=proxies)
                    variation_soup = BeautifulSoup(variation_page.content, 'lxml')
                    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
                    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
                    color = v['title']
                    print(name)
                    print(color)
                    print(price)
                    print("-------------")
                    # Save in csv
                    writer.writerow([name,color,price])

print("SCRAPING DONE")
How do I keep the file open through the whole script execution? Or do I have to open it every time I add content? EDIT: In fact, the file is not even created.
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block - that is the block's purpose.
You could put everything inside the block, but that only makes an existing problem worse: the code already reaches several levels of indentation, is long, and is becoming difficult to understand. This is why you use functions to organize the code. For example, if you put the big for loop in a function:
def do_stuff_with(categories, writer):
    for category in categories:
        # lots of logic here
        # use `writer.writerow` when needed

# Get everything else set up that doesn't need the file, first
categories = ... # do the BeautifulSoup input stuff

# then we can open the file and use the function:
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])
    do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic, for handling the variations for a single product. Or you can have a function to handle the creation of the categories data, and return it.
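A minimal sketch of that first suggestion, reusing the selectors and the base_url/proxies variables from your question (the function name and signature here are my own illustration):

# Hypothetical helper pulled out of the innermost loop: handles the
# variations of a single product and writes one row per variation.
def handle_variations(product_soup, writer):
    for v in product_soup.find_all(class_="m-productDescr_colorItem"):
        variation_page = requests.get(base_url + v['href'], proxies=proxies)
        variation_soup = BeautifulSoup(variation_page.content, 'lxml')
        price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
        name = variation_soup.find(class_="m-productDescr_headline").text.strip()
        writer.writerow([name, v['title'], price])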

Issue while extracting data using Beautifulsoup

My objective is to convert an xls file to an xlsx file. The xls file I am trying to convert is actually an html file containing tables (it is produced as the result of a query from Jira). To facilitate the conversion, I open a file handle, give it to BeautifulSoup, extract the table of interest, convert that table to a string, and hand it to a pandas dataframe for further processing.
This works fine, but when the file is large, say around 80 MB, it takes a long time to process. How do I overcome this?
import bs4, os
import pandas as pd

print('Begin')
fileName = 'TestSample.xls'
fileHandler = open(fileName, encoding='utf-8')
soup = bs4.BeautifulSoup(fileHandler, 'html.parser')
tbl = soup.find_all('table', id='issuetable')
df = pd.read_html(str(tbl))
df[0].to_excel("result.xlsx", index=False)
print('Completed')
There is no silver bullet for large files, but you can try a few different approaches.
from simplified_scrapy import SimplifiedDoc

print('Begin')
fileName = 'TestSample.xls'
html = open(fileName, encoding='utf-8').read()
doc = SimplifiedDoc(html)
start = 0 # If a string can uniquely mark the starting position of the data, performance will be better
tbl = doc.getElement('table', attr='id', value='issuetable', start=start)
print(tbl.outerHtml)
Or read the file block by block:
f = open(fileName, encoding='utf-8')
html = ''
start = '' # String marking the start of the block
end = ''   # String marking the end of the block
for line in f.readlines():
    if html: # already inside the block, keep accumulating
        html += line
        if line.find(end) >= 0:
            break
    elif line.find(start) >= 0: # found the start of the block
        html = line
        if line.find(end) >= 0:
            break
doc = SimplifiedDoc(html)
tbl = doc.getElement('table', attr='id', value='issuetable')
print(tbl.outerHtml)
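Either way, once the table's HTML is isolated you can hand just that fragment to pandas rather than the whole 80 MB file. A sketch, continuing from the extraction above:

import pandas as pd

# Parse only the extracted table fragment, not the entire file.
df = pd.read_html(tbl.outerHtml)
df[0].to_excel("result.xlsx", index=False)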

Python code scraping data from kickstarter does not work after some iteration

I am trying to scrape data from Kickstarter. The code works, but it throws the following error on page 15 (you might get the error on a different page, since the webpage is dynamic):
Traceback (most recent call last):
  File "C:\Users\lenovo\kick.py", line 30, in <module>
    csvwriter.writerow(row)
  File "C:\Users\lenovo\AppData\Local\Programs\Python\Python37\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\uff5c' in position 27: character maps to <undefined>
What might be the issue? Any suggestion?
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import csv

KICKSTARTER_SEARCH_URL = "https://www.kickstarter.com/discover/advanced?category_id=16&sort=newest&seed=2502593&page={}"
DATA_FILE = "kickstarter.csv"

csvfile = open(DATA_FILE, 'w')
csvwriter = csv.writer(csvfile, delimiter=',')

page_start = 0
while True:
    url = KICKSTARTER_SEARCH_URL.format(page_start)
    print(url)
    response = urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    project_details_divs = soup.findAll('div', {"class":"js-react-proj-card"})
    if len(project_details_divs) == 0:
        break
    for div in project_details_divs:
        project = json.loads(div['data-project'])
        row = [project["id"],project["name"],project["goal"],project["pledged"]]
        csvwriter.writerow(row)
    page_start += 1
csvfile.close()
Add the argument encoding to your file-opener. I mean, change
csvfile = open(DATA_FILE, 'w')
into
csvfile = open(DATA_FILE, 'w', encoding='utf-8')
But best practice here is to use a context manager:
with open(DATA_FILE, 'w', encoding='utf-8') as csvfile:
    # ...
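Applied to your script, that might look like the sketch below; the newline='' argument is an addition on my part, following the csv module documentation's recommendation for csv files:

with open(DATA_FILE, 'w', encoding='utf-8', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    page_start = 0
    while True:
        url = KICKSTARTER_SEARCH_URL.format(page_start)
        response = urlopen(url)
        soup = BeautifulSoup(response.read(), 'html.parser')
        project_details_divs = soup.findAll('div', {"class": "js-react-proj-card"})
        if len(project_details_divs) == 0:
            break
        for div in project_details_divs:
            project = json.loads(div['data-project'])
            csvwriter.writerow([project["id"], project["name"], project["goal"], project["pledged"]])
        page_start += 1
# No explicit csvfile.close() is needed; the with block closes the file.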

beautifulsoup to csv: putting paragraph of text into one line

I have a bunch of web text that I'd like to scrape and export to a csv file. The problem is that the text is split over multiple lines on the website and that's how beautifulsoup reads it. When I export to csv, all the text goes into one cell but the cell has multiple lines of text. When I try to read the csv into another program, it interprets the multiple lines in a way that yields a nonsensical dataset. The question is, how do I put all the text into a single line after I pull it with beautifulsoup but before I export to csv?
Here's a simple working example demonstrating the problem of multiple lines (in fact, the first few lines in the resulting csv are blank, so at first glance it may look empty):
import csv
import requests
from bs4 import BeautifulSoup
def main():
    r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
    soup = BeautifulSoup(r.text, "html.parser")
    with open('Temp.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f, delimiter=",")
        abstract = soup.find("article").text
        writer.writerow([abstract])

if __name__ == '__main__':
    main()
UPDATE: there have been some good suggestions, but it's still not working. The following code still produces a csv file with line breaks in a cell:
import csv
import requests
from bs4 import BeautifulSoup
with open('Temp.csv', 'w', encoding='utf8', newline='') as f:
    writer = csv.writer(f, delimiter=',')
    r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
    soup = BeautifulSoup(r.text, 'lxml')
    find_article = soup.find('article')
    find_2para = find_article.p.find_next_sibling("p")
    find_largetxt = find_article.p.find_next_sibling("p").nextSibling
    writer.writerow([find_2para, find_largetxt])
Here's another attempt based on a different suggestion. This one also ends up producing a line break in the csv file:
import csv
import requests
from bs4 import BeautifulSoup
def main():
    r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
    soup = BeautifulSoup(r.text, "html.parser")
    with open('Temp.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f, delimiter=",")
        abstract = soup.find("article").get_text(separator=" ", strip=True)
        writer.writerow([abstract])

if __name__ == '__main__':
    main()
Change your abstract = ... line into:
abstract = soup.find("article").get_text(separator=" ", strip=True)
It joins the pieces of text using the separator parameter (in this case, a single space) and strips leading and trailing whitespace from each piece.
The solution that ended up working for me is pretty simple:
abstract=soup.find("article").text.replace("\t", "").replace("\r", "").replace("\n", "")
That gets rid of all line breaks.
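An equivalent one-liner (my own variant, not from the answers above) is to split on whitespace and re-join, which collapses tabs, carriage returns and newlines in a single pass:

# str.split() with no arguments splits on any run of whitespace,
# so this normalizes the abstract to single-spaced text on one line.
abstract = " ".join(soup.find("article").text.split())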
r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
soup = BeautifulSoup(r.text, 'lxml') # I prefer using the lxml parser
find_article = soup.find('article')
# The next line finds the title, in this case: Econometrica: Mar 2017, Volume 85, Issue 2
find_title = find_article.h3
# find "Search for Yield"
find_yeild = find_article.h1
# first paragraph, e.g.: DOI: 10.3982/ECTA14057 p. 351-378
find_1para = find_article.p
# second p, e.g.: David Martinez‐Miera, Rafael Repullo
find_2para = find_article.p.find_next_sibling("p")
# find the large text area, e.g. 'We present a model of the relationship bet...'
find_largetxt = find_article.p.find_next_sibling("p").nextSibling
I used a variety of methods of getting to the text areas you want, just for the purpose of education (you can use .text on each of these to get the text without the tags, or you can use Zroq's method). You can then write any of these into the file, for example:
writer.writerow([find_title.text])

How can I update this code using urllib2 to Python3?

My current professor is using Python 2.7 for examples in class, but other professors that I will be taking classes from in the future have suggested I use Python 3.5. I am trying to convert my current Professor's examples from 2.7 to 3.5. Right now I'm having an issue with the urllib2 package, which I understand has been split in Python 3.
The original code in the iPython notebook looks like this :
import csv
import urllib2
data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
response = urllib2.urlopen(data_url)
myreader = csv.reader(response)
for i in range(5):
    row = next(myreader)
    print ','.join(row)
Which I have converted to:
import csv
import urllib.request
data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
response = urllib.request.urlopen(data_url)
myreader = csv.reader(response)
for i in range(5):
    row = next(myreader)
    print(','.join(row))
But that leaves me with the error:
Error Traceback (most recent call last)
<ipython-input-19-20da479e256f> in <module>()
7 myreader = csv.reader(response)
8 for i in range(5):
----> 9 row = next(myreader)
10 print(','.join(row))
Error: iterator should return strings, not bytes (did you open the file in text mode?)
I'm unsure how to proceed from here. Any ideas?
Wrap response in another iterator which decodes the bytes to strings and yields the strings:
import csv
import urllib.request
def decode_iter(it):
    # iterate line by line
    for line in it:
        # convert bytes to string using `bytes.decode`
        yield line.decode()
data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
response = urllib.request.urlopen(data_url)
myreader = csv.reader(decode_iter(response))
for i in range(5):
    row = next(myreader)
    print(','.join(row))
UPDATE
Instead of decode_iter, you can use codecs.iterdecode:
import csv
import codecs
import urllib.request
data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
response = urllib.request.urlopen(data_url)
myreader = csv.reader(codecs.iterdecode(response, 'utf-8'))
for i in range(5):
    row = next(myreader)
    print(','.join(row))
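A third option (an alternative I'm adding here, not part of the original answer) is io.TextIOWrapper, which wraps the binary HTTP response in a text stream directly:

import csv
import io
import urllib.request

data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
response = urllib.request.urlopen(data_url)
# TextIOWrapper decodes the byte stream on the fly, so csv.reader sees strings.
myreader = csv.reader(io.TextIOWrapper(response, encoding='utf-8'))
for i in range(5):
    row = next(myreader)
    print(','.join(row))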
