Can't write in csv file - python

When I try to write the information to the CSV file, an error is thrown:
Traceback (most recent call last):
File "sizeer.py", line 68, in <module>
writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup
# Scrape every product variation reachable from the site's menu categories and
# write one Product/Color/Price row per variation to result.csv.
# NOTE: every entry below uses the same key "http"; a dict literal keeps only
# the last value given for a repeated key, so only the final proxy is used.
proxies = {
"http":"http://195.189.60.97:3128",
"http":"http://103.78.75.165:8080",
"http":"http://212.87.220.2:3128",
"http":"http://88.99.134.61:8080",
"http":"http://103.102.139.178:8080",
"http":"http://218.60.8.83:3129",
"http":"http://124.121.105.193:8888",
"http":"http://198.237.114.54:8080",
"http":"http://36.67.106.58:8080",
"http":"http://35.214.241.28:3128"
}
base_url = ...
page = requests.get(base_url, proxies=proxies)
# Stop the script when the start page could not be fetched.
if page.status_code != 200:
exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')
# Open the output file and write the header row.
# NOTE(review): per the traceback quoted above, the writer.writerow() call at
# the bottom runs after this file has been closed - presumably the with block
# ended right after the header row in the original indentation.
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
#Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
# Only a direct child <a> of the menu item is considered.
anchor = cw.find("a", recursive=False)
categories.append(anchor['href'])
#Iterate categories
for category in categories:
cat_page = requests.get(base_url + category, proxies=proxies)
cat_soup = BeautifulSoup(cat_page.content, 'lxml')
products_wrapper = cat_soup.find(class_="b-productList")
cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
# The page count is the first integer found in the text of the last pagination <span>.
max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]
#Iterate category with pagination and get products
for i in range(1, max_page+1):
cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")
for link in product_links:
#Get product data
product_page = requests.get(base_url+link['href'], proxies=proxies)
product_soup = BeautifulSoup(product_page.content, 'lxml')
#Get product variations
variations = product_soup.find_all(class_="m-productDescr_colorItem")
#If there are variations
if len(variations) > 0:
for v in variations:
variation_page = requests.get(base_url+v['href'], proxies=proxies)
variation_soup = BeautifulSoup(variation_page.content, 'lxml')
# The price is the first space-separated token of the s-newPrice text.
price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
name = variation_soup.find(class_="m-productDescr_headline").text.strip()
color = v['title']
print(name)
print(color)
print(price)
print("-------------")
#Save in csv
# This is the call that raises ValueError in the traceback quoted above.
writer.writerow([name,color,price])
print("SCRAPING DONE")
How can I keep the file open through the whole script execution? Or do I have to open it every time I add content? EDIT: In fact, the file is not even created.

# The file object is only open inside this with block; it is closed as soon
# as the block exits, so `writer` cannot be used afterwards.
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block - that is the block's purpose.
You could put everything inside the block, but that only makes the existing problem worse: the code already reaches several levels of indentation, is long, and becomes difficult to understand. This is why you use functions to organize the code. For example, if you put the big for loop in a function:
def do_stuff_with(categories, writer):
    """Process every category, emitting result rows through ``writer``.

    Skeleton only: the per-category logic from the question goes in the
    loop body.

    Args:
        categories: iterable of the categories to process.
        writer: the csv writer to call ``writerow`` on; the caller keeps
            the underlying file open for the duration of this call.
    """
    for category in categories:
        # lots of logic here
        # use `writer.writerow` when needed
        pass  # placeholder so the skeleton is valid Python
# Get everything else set up that doesn't need the file, first
categories = ...  # do the BeautifulSoup input stuff

# then we can open the file and use the function: because the call sits
# inside the with block, the file stays open for the whole call and is
# closed only after it returns.
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product", "Color", "Price"])
    do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic, for handling the variations for a single product. Or you can have a function to handle the creation of the categories data, and return it.

Related

CSV file being exported empty and only the headers are showing?

So I'm learning more about Python every day. I'm doing a mini web-scraping project, and at the very end, when I should see the results in an exported CSV, it comes up blank except for the headers. Any help is gladly appreciated! Thanks.
The code is below:
import csv
import requests
from bs4 import BeautifulSoup
# Fetch the Box Office Mojo yearly page and write one CSV row per table row
# in which every expected cell was found.
url = "https://www.boxofficemojo.com/year/"
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "html.parser")
# All <tr> elements inside the div with this exact class string.
box_office_table = soup.find("div", class_="a-section mojo-body aok-relative").find_all("tr")
# NOTE: mode 'a' appends, so the header row below is written again on every run.
with open('imdbmovies.csv', 'a', newline='') as csvfile:
writer = csv.writer(csvfile)
# Write headers to CSV file
writer.writerow(['numone_release', 'year', 'total_gross', 'releases', 'average', 'gross_change'])
for row in box_office_table:
try:
year_cell = row.find("td", class_="a-text-left mojo-header-column mojo-field-type-year mojo-sort-column")
money_cells = row.find_all("td", class_="a-text-right mojo-field-type-money")
releases_cell = row.find("td", class_="a-text-right mojo-field-type-positive_integer")
gross_change_cell = row.find("td", class_="a-text-right mojo-number-delta mojo-field-type-percent_delta")
numone_release_cell = row.find("td", class_="a-text-left mojo-field-type-release mojo-cell-wide")
# A row is written only when all five lookups matched; any other row is
# skipped without a message, which leaves just the header in the file
# if no row matches every class string.
if len(money_cells) >= 2 and year_cell is not None and releases_cell is not None and gross_change_cell is not None and numone_release_cell is not None:
total_gross_cell = money_cells[0]
average_cell = money_cells[1]
year = year_cell.text.strip()
total_gross = total_gross_cell.text.strip()
releases = releases_cell.text.strip()
average = average_cell.text.strip()
gross_change = gross_change_cell.text.strip()
numone_release = numone_release_cell.text.strip()
print(year, total_gross, releases, average, gross_change, numone_release)
# Write the row to the CSV file
writer.writerow([numone_release, year, total_gross, releases, average, gross_change])
except AttributeError:
# Either a cell is not found
# NOTE(review): ignoring the error silently hides why rows are skipped.
pass

Can't print items in CSV

When I open the sdata.csv file, it will not iterate; no error is shown, it simply prints nothing. Why could this be? I even did print(g) and it shows it is reading properly. I am also trying to write data to the same file, and the same blank file occurs with only the heading in it.
import urllib.request as request
import json
from urllib.request import urlopen, Request
import requests
import demjson
import csv
import time
# Download the exchange listing and one symbol's daily candles, write each to
# a CSV file, then read the file back and print one column.
req = Request('https://api.gameslabs.net/1.0.0/exchange', headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req) as response:
if response.getcode() == 200:
source = response.read()
data = json.loads(source)
else:
# NOTE(review): `data` is presumably not assigned on this path, so the
# json.dumps(data) call below would raise NameError.
print('An error occurred while attempting to retrieve data from the API.')
# Serialising and immediately re-parsing produces an equivalent copy of `data`.
y = json.dumps(data)
x = json.loads(y)
# NOTE: the file object handed to csv.writer is never closed or flushed
# explicitly, so rows may still be buffered when the file is re-opened below.
f = csv.writer(open("item-com.csv", "w+"))
# Write CSV Header, If you dont need that, remove this line
f.writerow(["symbol", "buy_game", "buy_item", "buy_name", "sell_game", "sell_item", "sell_name"])
# The loop variable rebinds `x`, shadowing the list being iterated.
for x in x:
f.writerow([x["symbol"],
x["buy"]["game"],
x["buy"]["item"],
x["buy"]["name"],
x["sell"]["game"],
x["sell"]["item"],
x["sell"]["name"]])
o = csv.DictReader(open("item-com.csv"))
for row in o:
print(row['buy_name'])
req2 = Request('https://api.gameslabs.net/1.0.0/exchange/symbol/MS_IRON_PICKAXE/candles?timeFrame=day',
headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req2) as response:
if response.getcode() == 200:
source2 = response.read()
data2 = json.loads(source2)
else:
print('An error occurred while attempting to retrieve data from the API.')
xdum = json.dumps(data2)
bdum = json.loads(xdum)
# Same pattern as above: the writer's file object is never closed explicitly.
ged = csv.writer(open("sdata.csv", "w+"))
# NOTE: the header names six columns ("volume" last) but each row written
# below contains five values.
ged.writerow(["timestamp", "low", "open", "close", "high", "volume"])
for bdum in bdum:
ged.writerow([bdum["timestamp"],
bdum["low"],
bdum["open"],
bdum["close"],
bdum["high"]])
g = csv.DictReader(open("sdata.csv"))
for row in g:
print(row['timestamp'])
You are writing to and reading from the same files. However, you don't ensure the file is closed in between. If you use a context manager, it will take care of that for you. I notice you are already using context managers for the URL responses.
I've slightly modified your code to use context managers for file management:
...
# Write the exchange entries inside a with block so the file is flushed and
# closed before it is opened again for reading.
# newline="" is what the csv module documents for files it reads or writes.
with open("item-com.csv", "w+", newline="") as csv_file:
    f = csv.writer(csv_file)
    # Write CSV Header, If you dont need that, remove this line
    f.writerow(["symbol", "buy_game", "buy_item", "buy_name", "sell_game", "sell_item", "sell_name"])
    # A separate loop variable keeps the list `x` from being rebound.
    for entry in x:
        f.writerow([entry["symbol"],
                    entry["buy"]["game"],
                    entry["buy"]["item"],
                    entry["buy"]["name"],
                    entry["sell"]["game"],
                    entry["sell"]["item"],
                    entry["sell"]["name"]])

# The writer's file is closed at this point, so every row is on disk.
with open("item-com.csv", newline="") as csv_file:
    o = csv.DictReader(csv_file)
    for row in o:
        print(row['buy_name'])

req2 = Request('https://api.gameslabs.net/1.0.0/exchange/symbol/MS_IRON_PICKAXE/candles?timeFrame=day',
               headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req2) as response:
    if response.getcode() == 200:
        source2 = response.read()
        data2 = json.loads(source2)
    else:
        print('An error occurred while attempting to retrieve data from the API.')
xdum = json.dumps(data2)
bdum = json.loads(xdum)

with open("sdata.csv", "w+", newline="") as csv_file:
    ged = csv.writer(csv_file)
    ged.writerow(["timestamp", "low", "open", "close", "high", "volume"])
    # A separate loop variable keeps the list `bdum` from being rebound.
    for candle in bdum:
        ged.writerow([candle["timestamp"],
                      candle["low"],
                      candle["open"],
                      candle["close"],
                      candle["high"]])

with open("sdata.csv", newline="") as csv_file:
    g = csv.DictReader(csv_file)
    for row in g:
        print(row['timestamp'])
Instead of writing line by line to the text file, try it this way. This method reduces repetitive I/O and doesn't have to keep the file open for a long time.
import pandas as pd  # the snippet uses `pd`, so it has to be imported

# Build all rows in memory first; `x` is the list of exchange entries
# loaded earlier. A separate loop variable keeps `x` from being rebound.
lst = [
    [
        entry["symbol"],
        entry["buy"]["game"],
        entry["buy"]["item"],
        entry["buy"]["name"],
        entry["sell"]["game"],
        entry["sell"]["item"],
        entry["sell"]["name"],
    ]
    for entry in x
]
# outside loop create a pandas dataframe
df = pd.DataFrame(lst)
# this is one of several options to save
df.to_csv('filename.csv')

Iteration Url Load from CSV in Python

Please help me.
I have URL data in a CSV file; the file has 100 rows and 1 column.
I want to load the data from line 1 to line 100 of the CSV using Python. How do I write the code?
However, when I run it, the loop only works once: it handles one of the lines, does not reach the end of the URLs in the CSV, and does not continue to the next URL.
disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
AttributeError: 'NoneType' object has no attribute 'text'
How do I carry on if an error occurs because an HTML element is not found?
The following is the Python code I use. Please help so that the scraping loop runs to the end of the URL list.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import pandas
# Read URLs from the CSV file and scrape title, description, image links and
# price information from a product page.
with open('Url Torch.csv','rt') as f:
data = csv.reader(f, delimiter=',')
for row in data:
# NOTE(review): index 2 is the third column, but the question describes a
# one-column file - confirm the intended column.
URL_GO = row[2]
def variable_Scrape(url):
# Fetch `url` and print the scraped fields; returns False when a lookup
# raises AttributeError (for example when find() returned None).
try:
cookies = dict(cookie="............")
request = requests.get(url, cookies=cookies)
html = BeautifulSoup(request.content, 'html.parser')
title = html.find('div', class_='title').text.strip().strip('\n')
desc = html.find('div', class_='content').text
link = html.find_all('img', class_='lazyload slide-item owl-lazy')
normal_price = html.find('div', class_='amount public').text.strip().strip('\n')
disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
except AttributeError as e:
print(e)
#ConnectionAbortedError
return False
else:
print(title)
#print(desc)
#print(link)
finally:
# NOTE: finally also runs after the except branch; in that case some of
# title/desc/link may never have been assigned.
print(title)
print(desc)
print(link)
print('Finally.....')
# NOTE(review): if this call sat outside the for loop in the original
# indentation, only the last URL_GO would be scraped - confirm.
variable_Scrape(URL_GO)
It is hard to give you an exact answer without seeing your CSV file, but try this:
import csv
# Open the file in a with block so it is closed once the rows are read;
# newline='' is what the csv module documents for files given to csv.reader.
with open('you_file.csv', newline='') as f:
    csv_f = csv.reader(f)
    for row in csv_f:
        # print() as a function, matching the question's code
        print(row[0])
This is the code
import csv
data = []  # create an empty list to store rows on it
# newline='' is what the csv module documents for files given to csv.reader.
with open('emails.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        data.append(row)  # add each row to the list
Based on your comments about passing a loop when the url is not ok:
for url in data:  # data is the list where url stored
    try:
        # do your code here (requests, beautifulsoup) :
        # r = requests.get(url) ...
        ...  # placeholder so the skeleton is valid Python
    except Exception as exc:
        # Report the failure instead of hiding it; unlike a bare `except:`,
        # this still lets Ctrl-C (KeyboardInterrupt) stop the script.
        print("Skipping {!r}: {}".format(url, exc))
        # will go to the next loop (next url) if an error happens
        continue

How do I create a CSV file with web-scraped content from several URLs?

I want to create a CSV file from webscraped content. The content is from FinViz.com
I want to scrape the table from this website 20 times for 20 different stocks and input all the content into a CSV file. Within my code, I generate a list of stocks from a scrape of twitter content. The list of stocks that is generated is the same list that I want to get information on from the FinViz.com tables.
Here is my code:
import csv
import urllib.request
from bs4 import BeautifulSoup
# Collect ticker symbols from a Twitter page, build one FinViz URL per ticker,
# fetch each page and write table cells to today.csv.
twiturl = "https://twitter.com/ACInvestorBlog"
twitpage = urllib.request.urlopen(twiturl)
soup = BeautifulSoup(twitpage,"html.parser")
print(soup.title.text)
# Text of every <b> inside the cashtag links.
tweets = [i.text for i in soup.select('a.twitter-cashtag.pretty-link.js-nav b')]
print(tweets)
url_base = "https://finviz.com/quote.ashx?t="
url_list = [url_base + tckr for tckr in tweets]
# NOTE(review): fpage/fsoup are rebound for every URL. The question reports
# that only the last URL's data reaches the file - presumably because the
# with block below sits outside this loop in the original indentation.
for url in url_list:
fpage = urllib.request.urlopen(url)
fsoup = BeautifulSoup(fpage, 'html.parser')
# scrape single page and add data to list
# write datalist
# Mode 'a' appends to any existing today.csv.
with open('today.csv', 'a') as file:
writer = csv.writer(file)
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
The trouble that I am running into is that my CSV file only has the webscraped data from the last item in the list. Instead I want the entire list in a sequence of rows.
Here is what my CSV file looks like:
Index,P/E,EPS (ttm),Insider Own,Shs Outstand,Perf Week,Market Cap,Forward P/E,EPS next Y,Insider Trans,Shs Float,Perf Month,Income,PEG,EPS next Q,Inst Own,Short Float,Perf Quarter,Sales,P/S,EPS this Y,Inst Trans,Short Ratio,Perf Half Y,Book/sh,P/B,EPS next Y,ROA,Target Price,Perf Year,Cash/sh,P/C,EPS next 5Y,ROE,52W Range,Perf YTD,Dividend,P/FCF,EPS past 5Y,ROI,52W High,Beta,Dividend %,Quick Ratio,Sales past 5Y,Gross Margin,52W Low,ATR,Employees,Current Ratio,Sales Q/Q,Oper. Margin,RSI (14),Volatility,Optionable,Debt/Eq,EPS Q/Q,Profit Margin,Rel Volume,Prev Close,Shortable,LT Debt/Eq,Earnings,Payout,Avg Volume,Price,Recom,SMA20,SMA50,SMA200,Volume,Change
-,-,-1.75,7.94%,79.06M,-22.52%,296.48M,-,-1.74,-4.61%,72.41M,-23.16%,-85.70M,-,-0.36,62.00%,3.21%,1.63%,15.10M,19.63,-197.00%,18.05%,2.57,66.67%,-0.65,-,-8.10%,-127.70%,12.17,-6.25%,0.93,4.03,-,146.70%,2.05 - 5.86,3.59%,-,-,-,385.80%,-36.01%,-,-,1.30,-,76.50%,82.93%,0.41,100,1.30,-59.60%,-,36.98,16.13% 9.32%,Yes,-,90.00%,-,0.82,3.63,Yes,-,Nov 08,-,902.43K,3.75,2.30,-22.08%,-10.43%,11.96%,"742,414",3.31%
It would be better to open your output file first, rather than keep on opening/closing it for each URL that you fetch. Exception handling is needed to catch cases where the URL does not exist.
Also on your output, you should open the file with newline='' to avoid extra empty lines being written to the file:
import csv
import urllib.request
from bs4 import BeautifulSoup
# The header row is written only for the first page fetched successfully.
write_header = True

twiturl = "https://twitter.com/ACInvestorBlog"
twitpage = urllib.request.urlopen(twiturl)
soup = BeautifulSoup(twitpage, "html.parser")
print(soup.title.text)

tweets = [i.text for i in soup.select('a.twitter-cashtag.pretty-link.js-nav b')]
print(tweets)

url_base = "https://finviz.com/quote.ashx?t="
url_list = [url_base + tckr for tckr in tweets]

# Open the output once, before the loop, so every URL adds a row to the same
# file; newline='' avoids extra empty lines in the csv output.
with open('today.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    for url in url_list:
        try:
            fpage = urllib.request.urlopen(url)
            fsoup = BeautifulSoup(fpage, 'html.parser')

            # write header row (once)
            if write_header:
                writer.writerow([e.text for e in fsoup.find_all('td', {'class': 'snapshot-td2-cp'})])
                write_header = False

            # write body row
            writer.writerow([e.text for e in fsoup.find_all('td', {'class': 'snapshot-td2'})])
        except urllib.error.HTTPError:
            # Skip tickers whose page cannot be fetched and keep going.
            print("{} - not found".format(url))
So today.csv would start like:
Index,P/E,EPS (ttm),Insider Own,Shs Outstand,Perf Week,Market Cap,Forward P/E,EPS next Y,Insider Trans,Shs Float,Perf Month,Income,PEG,EPS next Q,Inst Own,Short Float,Perf Quarter,Sales,P/S,EPS this Y,Inst Trans,Short Ratio,Perf Half Y,Book/sh,P/B,EPS next Y,ROA,Target Price,Perf Year,Cash/sh,P/C,EPS next 5Y,ROE,52W Range,Perf YTD,Dividend,P/FCF,EPS past 5Y,ROI,52W High,Beta,Dividend %,Quick Ratio,Sales past 5Y,Gross Margin,52W Low,ATR,Employees,Current Ratio,Sales Q/Q,Oper. Margin,RSI (14),Volatility,Optionable,Debt/Eq,EPS Q/Q,Profit Margin,Rel Volume,Prev Close,Shortable,LT Debt/Eq,Earnings,Payout,Avg Volume,Price,Recom,SMA20,SMA50,SMA200,Volume,Change
-,-,-10.85,4.60%,2.36M,11.00%,8.09M,-,-,-62.38%,1.95M,-16.14%,-14.90M,-,-,2.30%,10.00%,-44.42%,0.00M,-,21.80%,-5.24%,3.10,-38.16%,1.46,2.35,-,-155.10%,65.00,-50.47%,-,-,-,-238.40%,2.91 - 11.20,-38.29%,-,-,54.50%,-,-69.37%,1.63,-,2.20,-,-,17.87%,0.36,15,2.20,-,-,39.83,11.38% 10.28%,No,0.00,68.70%,-,1.48,3.30,Yes,0.00,Feb 28 AMC,-,62.76K,3.43,1.00,-5.21%,-25.44%,-37.33%,"93,166",3.94%
-,-,-0.26,1.50%,268.98M,3.72%,2.25B,38.05,0.22,-0.64%,263.68M,-9.12%,-55.50M,-,0.05,-,9.96%,-12.26%,1.06B,2.12,-328.10%,25.95%,2.32,17.72%,12.61,0.66,650.00%,-0.90%,12.64,-38.73%,0.03,264.87,-,-1.90%,6.69 - 15.27,-0.48%,-,-,-28.70%,0.00%,-45.17%,2.20,-,0.70,16.40%,67.80%,25.11%,0.41,477,0.80,71.90%,5.30%,52.71,4.83% 5.00%,Yes,0.80,7.80%,-5.20%,0.96,7.78,Yes,0.80,Feb 27 AMC,-,11.31M,8.37,2.20,0.99%,-1.63%,-4.72%,"10,843,026",7.58%
If you only want your file to contain data from one run of the script, you do not need a (append) mode; just use w instead.

Writing scraped data to csv

I've been stuck trying to transfer the data that I scraped to a csv file. Here is my code:
import requests, bs4, csv, sys
# Collect the result links from the listing page, then scrape label/value
# pairs from each linked page and write them to french.csv.
# Python 2 only: reload(sys) makes setdefaultencoding available again so the
# default string encoding can be forced to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
url = 'http://www.constructeursdefrance.com/resultat/?dpt=01'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text,'html.parser')
links = []
for div in soup.select('.link'):
link = div.a.get('href')
links.append(link)
for i in links:
url2 = i
res2 = requests.get(url2)
soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
for each in soup2.select('li > strong'):
# `data` is a single (text, next_sibling) tuple, rebound for every element.
data = each.text, each.next_sibling
with open('french.csv', 'wb') as file:
writer = csv.writer(file)
# writerows expects a sequence of rows; this is the call reported in the
# traceback that follows the code.
writer.writerows(data)
the output says:
Traceback (most recent call last):
File "test_new_project.py", line 23, in <module>
writer.writerows(data)
csv.Error: sequence expected
But I am trying to put tuples into the CSV file, and as far as I know csv accepts tuples and lists. How can I fix this problem?
Atirag is correct, but you have another issue which is that your with call to open the output file is nested within a for loop. So if there is more than one link, the file will be overwritten each time and the output will not be what you expect. I think this should generate the output you intend:
for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)

# Open the output once, outside the loop over links, so rows from every link
# go into the same file instead of each link overwriting the previous one.
with open("french.csv", "w") as file:
    writer = csv.writer(file)
    for i in links:
        res2 = requests.get(i)
        soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
        for each in soup2.select('li > strong'):
            # One row per <strong>: its text and the node that follows it.
            writer.writerow([each.text, each.next_sibling])
Change this
# Each iteration rebinds `data` to one 2-tuple, so after the loop it holds
# only the last pair rather than a list of rows.
for each in soup2.select('li > strong'):
data = each.text, each.next_sibling
to this
# Accumulate one (text, next_sibling) tuple per element, giving writerows()
# the list of rows it expects.
data = []
for each in soup2.select('li > strong'):
    data.append((each.text, each.next_sibling))
Your data variable was a single tuple, not a list of tuples. The above code creates a list of tuples.
Another solution is this (mind the indentation):
# Collect the rows from every link first ...
data = []
for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data.append((each.text, each.next_sibling))

# ... then write them in one go. This block is deliberately outside both
# loops so the file is opened once. The 'wb' mode is kept from the question,
# which is Python 2 code (it calls reload(sys)).
with open('french.csv', 'wb') as file:
    writer = csv.writer(file)
    writer.writerows(data)

Categories

Resources