Can't write to CSV file - Python
When I try to write the information to the CSV file, this error is thrown:
Traceback (most recent call last):
  File "sizeer.py", line 68, in <module>
    writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup

proxies = {
    "http":"http://195.189.60.97:3128",
    "http":"http://103.78.75.165:8080",
    "http":"http://212.87.220.2:3128",
    "http":"http://88.99.134.61:8080",
    "http":"http://103.102.139.178:8080",
    "http":"http://218.60.8.83:3129",
    "http":"http://124.121.105.193:8888",
    "http":"http://198.237.114.54:8080",
    "http":"http://36.67.106.58:8080",
    "http":"http://35.214.241.28:3128"
}

base_url = ...
page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
    exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')

with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])

#Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
    anchor = cw.find("a", recursive=False)
    categories.append(anchor['href'])

#Iterate categories
for category in categories:
    cat_page = requests.get(base_url + category, proxies=proxies)
    cat_soup = BeautifulSoup(cat_page.content, 'lxml')
    products_wrapper = cat_soup.find(class_="b-productList")
    cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
    max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]
    #Iterate category with pagination and get products
    for i in range(1, max_page+1):
        cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
        cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
        product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")
        for link in product_links:
            #Get product data
            product_page = requests.get(base_url+link['href'], proxies=proxies)
            product_soup = BeautifulSoup(product_page.content, 'lxml')
            #Get product variations
            variations = product_soup.find_all(class_="m-productDescr_colorItem")
            #If there are variations
            if len(variations) > 0:
                for v in variations:
                    variation_page = requests.get(base_url+v['href'], proxies=proxies)
                    variation_soup = BeautifulSoup(variation_page.content, 'lxml')
                    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
                    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
                    color = v['title']
                    print(name)
                    print(color)
                    print(price)
                    print("-------------")
                    #Save in csv
                    writer.writerow([name,color,price])

print("SCRAPING DONE")
How do I keep the file open through the whole script execution? Or do I have to open it every time I add content? EDIT: In fact, the file is not even created.
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block - that is the block's purpose.
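As a minimal sketch of the same failure mode (the file name here is just an example, not from the question):

import csv

with open("demo.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["still", "open"])  # works: the file is open inside the block

# the with block has ended, so the underlying file is now closed
writer.writerow(["too", "late"])  # ValueError: I/O operation on closed file

The csv.writer object keeps a reference to the closed file, so any later writerow call produces exactly the traceback shown above.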
You could put everything inside the block, but that only makes an existing problem worse: the code already reaches several levels of indentation, it is long, and it is becoming difficult to understand. This is why you use functions to organize the code. For example, put the big for loop in a function:
def do_stuff_with(categories, writer):
    for category in categories:
        # lots of logic here
        # use `writer.writerow` when needed

# Get everything else set up that doesn't need the file, first
categories = ...  # do the BeautifulSoup input stuff

# then we can open the file and use the function:
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])
    do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull the innermost logic, which handles the variations of a single product, out into its own function. Or write a function that builds the categories data and returns it.
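For instance, the variation handling could look roughly like this; `scrape_variation` is just an illustrative name, while the selectors, proxies and parser are the ones already used in the question:

def scrape_variation(base_url, v, proxies):
    # fetch one colour variation and return the fields to write to the CSV
    variation_page = requests.get(base_url + v['href'], proxies=proxies)
    variation_soup = BeautifulSoup(variation_page.content, 'lxml')
    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
    return [name, v['title'], price]

# inside the product loop:
#     for v in variations:
#         writer.writerow(scrape_variation(base_url, v, proxies))

Each helper then does one job, and only do_stuff_with needs to know about the writer.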
Related
CSV file being exported empty and only the headers are showing?
I'm learning more about Python every day. I'm doing a mini web-scraping project, and at the very end, when I should see the results in an exported CSV, it comes up blank except for the headers. Any help is gladly appreciated! Thanks. The code is below:

import csv
import requests
from bs4 import BeautifulSoup

url = "https://www.boxofficemojo.com/year/"
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "html.parser")
box_office_table = soup.find("div", class_="a-section mojo-body aok-relative").find_all("tr")

with open('imdbmovies.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Write headers to CSV file
    writer.writerow(['numone_release', 'year', 'total_gross', 'releases', 'average', 'gross_change'])

    for row in box_office_table:
        try:
            year_cell = row.find("td", class_="a-text-left mojo-header-column mojo-field-type-year mojo-sort-column")
            money_cells = row.find_all("td", class_="a-text-right mojo-field-type-money")
            releases_cell = row.find("td", class_="a-text-right mojo-field-type-positive_integer")
            gross_change_cell = row.find("td", class_="a-text-right mojo-number-delta mojo-field-type-percent_delta")
            numone_release_cell = row.find("td", class_="a-text-left mojo-field-type-release mojo-cell-wide")

            if len(money_cells) >= 2 and year_cell is not None and releases_cell is not None and gross_change_cell is not None and numone_release_cell is not None:
                total_gross_cell = money_cells[0]
                average_cell = money_cells[1]

                year = year_cell.text.strip()
                total_gross = total_gross_cell.text.strip()
                releases = releases_cell.text.strip()
                average = average_cell.text.strip()
                gross_change = gross_change_cell.text.strip()
                numone_release = numone_release_cell.text.strip()

                print(year, total_gross, releases, average, gross_change, numone_release)

                # Write the row to the CSV file
                writer.writerow([numone_release, year, total_gross, releases, average, gross_change])
        except AttributeError:
            # Either a cell is not found
            pass
Can't print items in CSV
When I open the sdata.csv file it will not iterate; no error is shown, it simply does not print. Why could this be? I even did print(g) and it shows it is reading properly. I am also trying to write data to the same file, and the same blank file occurs, with only the heading in it.

import urllib.request as request
import json
from urllib.request import urlopen, Request
import requests
import demjson
import csv
import time

req = Request('https://api.gameslabs.net/1.0.0/exchange', headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req) as response:
    if response.getcode() == 200:
        source = response.read()
        data = json.loads(source)
    else:
        print('An error occurred while attempting to retrieve data from the API.')

y = json.dumps(data)
x = json.loads(y)

f = csv.writer(open("item-com.csv", "w+"))
# Write CSV Header, If you dont need that, remove this line
f.writerow(["symbol", "buy_game", "buy_item", "buy_name", "sell_game", "sell_item", "sell_name"])
for x in x:
    f.writerow([x["symbol"], x["buy"]["game"], x["buy"]["item"], x["buy"]["name"], x["sell"]["game"], x["sell"]["item"], x["sell"]["name"]])

o = csv.DictReader(open("item-com.csv"))
for row in o:
    print(row['buy_name'])

req2 = Request('https://api.gameslabs.net/1.0.0/exchange/symbol/MS_IRON_PICKAXE/candles?timeFrame=day', headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req2) as response:
    if response.getcode() == 200:
        source2 = response.read()
        data2 = json.loads(source2)
    else:
        print('An error occurred while attempting to retrieve data from the API.')

xdum = json.dumps(data2)
bdum = json.loads(xdum)

ged = csv.writer(open("sdata.csv", "w+"))
ged.writerow(["timestamp", "low", "open", "close", "high", "volume"])
for bdum in bdum:
    ged.writerow([bdum["timestamp"], bdum["low"], bdum["open"], bdum["close"], bdum["high"]])

g = csv.DictReader(open("sdata.csv"))
for row in g:
    print(row['timestamp'])
You are writing to and reading from the same files. However, you don't ensure the file is closed in between. If you use a context manager, it will take care of that for you. I notice you are already using context managers for the URL responses. I've slightly modified your code to use context managers for file management as well:

...
with open("item-com.csv", "w+") as csv_file:
    f = csv.writer(csv_file)
    # Write CSV Header, If you dont need that, remove this line
    f.writerow(["symbol", "buy_game", "buy_item", "buy_name", "sell_game", "sell_item", "sell_name"])
    for x in x:
        f.writerow([x["symbol"], x["buy"]["game"], x["buy"]["item"], x["buy"]["name"], x["sell"]["game"], x["sell"]["item"], x["sell"]["name"]])

with open("item-com.csv") as csv_file:
    o = csv.DictReader(csv_file)
    for row in o:
        print(row['buy_name'])

req2 = Request('https://api.gameslabs.net/1.0.0/exchange/symbol/MS_IRON_PICKAXE/candles?timeFrame=day', headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req2) as response:
    if response.getcode() == 200:
        source2 = response.read()
        data2 = json.loads(source2)
    else:
        print('An error occurred while attempting to retrieve data from the API.')

xdum = json.dumps(data2)
bdum = json.loads(xdum)

with open("sdata.csv", "w+") as csv_file:
    ged = csv.writer(csv_file)
    ged.writerow(["timestamp", "low", "open", "close", "high", "volume"])
    for bdum in bdum:
        ged.writerow([bdum["timestamp"], bdum["low"], bdum["open"], bdum["close"], bdum["high"]])

with open("sdata.csv") as csv_file:
    g = csv.DictReader(csv_file)
    for row in g:
        print(row['timestamp'])
Instead of writing line by line to the text file, try it this way. This method reduces repetitive I/O and doesn't have to keep the file open for a long time:

import pandas as pd

lst = []
for x in x:
    tmpTuple = ([x["symbol"], x["buy"]["game"], x["buy"]["item"], x["buy"]["name"], x["sell"]["game"], x["sell"]["item"], x["sell"]["name"]])
    lst.append(tmpTuple)

# outside the loop, create a pandas dataframe
df = pd.DataFrame(lst)
# this is one of several ways to save it
df.to_csv('filename.csv')
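A small refinement on that pandas sketch, not part of the answer above: by default DataFrame.to_csv also writes a numeric row index and uses 0..N as column names, so passing the column names and index=False reproduces the header row from the csv.writer version:

columns = ["symbol", "buy_game", "buy_item", "buy_name", "sell_game", "sell_item", "sell_name"]
df = pd.DataFrame(lst, columns=columns)
df.to_csv('filename.csv', index=False)  # drop the numeric index column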
Iterating over URLs loaded from a CSV in Python
Please help me. I have URL data in a CSV file; in that file there are 100 rows and 1 column. I want to load the data from line 1 to line 100 of the CSV using Python. How do I write that code? At the moment, after running, the loop only works once, on one of the lines; it does not reach the end of the URLs in the CSV and does not continue to the next URL.

disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
AttributeError: 'NoneType' object has no attribute 'text'

How do I carry on when an error occurs because the element is not found in the HTML? The following is the Python code I use; please help so that the looping scrape runs to the end of the URL list.

from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import pandas

with open('Url Torch.csv','rt') as f:
    data = csv.reader(f, delimiter=',')
    for row in data:
        URL_GO = row[2]

def variable_Scrape(url):
    try:
        cookies = dict(cookie="............")
        request = requests.get(url, cookies=cookies)
        html = BeautifulSoup(request.content, 'html.parser')
        title = html.find('div', class_='title').text.strip().strip('\n')
        desc = html.find('div', class_='content').text
        link = html.find_all('img', class_='lazyload slide-item owl-lazy')
        normal_price = html.find('div', class_='amount public').text.strip().strip('\n')
        disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
    except AttributeError as e:
        print(e)
        #ConnectionAbortedError
        return False
    else:
        print(title)
        #print(desc)
        #print(link)
    finally:
        print(title)
        print(desc)
        print(link)
        print('Finally.....')

variable_Scrape(URL_GO)
It is hard to give you an exact answer without seeing your CSV file, but try this:

import csv

f = open('you_file.csv')
csv_f = csv.reader(f)

for row in csv_f:
    print(row[0])
This is the code:

import csv

data = []  # create an empty list to store rows in
with open('emails.csv') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        data.append(row)  # add each row to the list

Based on your comments about skipping to the next loop iteration when the URL is not OK:

for url in data:  # data is the list where the urls are stored
    try:
        # do your code here (requests, beautifulsoup), e.g.:
        # r = requests.get(url) ...
        ...
    except:
        pass  # will go to the next loop iteration (next url) if an error happens
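Putting those two pieces together for this question is roughly the following sketch, assuming the file name and column index (row[2]) from the question, and that simply skipping failed URLs is acceptable:

import csv
import requests
from bs4 import BeautifulSoup

# read every URL up front; row[2] is the column index used in the question
with open('Url Torch.csv', 'rt') as f:
    urls = [row[2] for row in csv.reader(f)]

for url in urls:
    try:
        r = requests.get(url, timeout=10)
        html = BeautifulSoup(r.content, 'html.parser')
        title_tag = html.find('div', class_='title')
        print(url, title_tag.text.strip() if title_tag else 'title not found')
    except requests.RequestException as e:
        print('skipping', url, '-', e)  # move on to the next URL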
How do I create a CSV file with web-scraped content from several URLs?
I want to create a CSV file from web-scraped content. The content is from FinViz.com. I want to scrape the table from this website 20 times for 20 different stocks and put all the content into a CSV file. Within my code, I generate a list of stocks from a scrape of Twitter content. The list of stocks that is generated is the same list that I want to get information on from the FinViz.com tables. Here is my code:

import csv
import urllib.request
from bs4 import BeautifulSoup

twiturl = "https://twitter.com/ACInvestorBlog"
twitpage = urllib.request.urlopen(twiturl)
soup = BeautifulSoup(twitpage,"html.parser")
print(soup.title.text)

tweets = [i.text for i in soup.select('a.twitter-cashtag.pretty-link.js-nav b')]
print(tweets)

url_base = "https://finviz.com/quote.ashx?t="
url_list = [url_base + tckr for tckr in tweets]

for url in url_list:
    fpage = urllib.request.urlopen(url)
    fsoup = BeautifulSoup(fpage, 'html.parser')

    # scrape single page and add data to list
    # write datalist
    with open('today.csv', 'a') as file:
        writer = csv.writer(file)
        # write header row
        writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
        # write body row
        writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))

The trouble that I am running into is that my CSV file only has the web-scraped data from the last item in the list. Instead, I want the entire list as a sequence of rows. Here is what my CSV file looks like:

Index,P/E,EPS (ttm),Insider Own,Shs Outstand,Perf Week,Market Cap,Forward P/E,EPS next Y,Insider Trans,Shs Float,Perf Month,Income,PEG,EPS next Q,Inst Own,Short Float,Perf Quarter,Sales,P/S,EPS this Y,Inst Trans,Short Ratio,Perf Half Y,Book/sh,P/B,EPS next Y,ROA,Target Price,Perf Year,Cash/sh,P/C,EPS next 5Y,ROE,52W Range,Perf YTD,Dividend,P/FCF,EPS past 5Y,ROI,52W High,Beta,Dividend %,Quick Ratio,Sales past 5Y,Gross Margin,52W Low,ATR,Employees,Current Ratio,Sales Q/Q,Oper. Margin,RSI (14),Volatility,Optionable,Debt/Eq,EPS Q/Q,Profit Margin,Rel Volume,Prev Close,Shortable,LT Debt/Eq,Earnings,Payout,Avg Volume,Price,Recom,SMA20,SMA50,SMA200,Volume,Change
-,-,-1.75,7.94%,79.06M,-22.52%,296.48M,-,-1.74,-4.61%,72.41M,-23.16%,-85.70M,-,-0.36,62.00%,3.21%,1.63%,15.10M,19.63,-197.00%,18.05%,2.57,66.67%,-0.65,-,-8.10%,-127.70%,12.17,-6.25%,0.93,4.03,-,146.70%,2.05 - 5.86,3.59%,-,-,-,385.80%,-36.01%,-,-,1.30,-,76.50%,82.93%,0.41,100,1.30,-59.60%,-,36.98,16.13% 9.32%,Yes,-,90.00%,-,0.82,3.63,Yes,-,Nov 08,-,902.43K,3.75,2.30,-22.08%,-10.43%,11.96%,"742,414",3.31%
It would be better to open your output file first, rather than keep opening and closing it for each URL that you fetch. Exception handling is needed to catch cases where the URL does not exist. Also, you should open the output file with newline='' to avoid extra empty lines being written to it:

import csv
import urllib.request
from bs4 import BeautifulSoup

write_header = True

twiturl = "https://twitter.com/ACInvestorBlog"
twitpage = urllib.request.urlopen(twiturl)
soup = BeautifulSoup(twitpage,"html.parser")
print(soup.title.text)

tweets = [i.text for i in soup.select('a.twitter-cashtag.pretty-link.js-nav b')]
print(tweets)

url_base = "https://finviz.com/quote.ashx?t="
url_list = [url_base + tckr for tckr in tweets]

with open('today.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    for url in url_list:
        try:
            fpage = urllib.request.urlopen(url)
            fsoup = BeautifulSoup(fpage, 'html.parser')

            # write header row (once)
            if write_header:
                writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
                write_header = False

            # write body row
            writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
        except urllib.error.HTTPError:
            print("{} - not found".format(url))

So today.csv would start like:

Index,P/E,EPS (ttm),Insider Own,Shs Outstand,Perf Week,Market Cap,Forward P/E,EPS next Y,Insider Trans,Shs Float,Perf Month,Income,PEG,EPS next Q,Inst Own,Short Float,Perf Quarter,Sales,P/S,EPS this Y,Inst Trans,Short Ratio,Perf Half Y,Book/sh,P/B,EPS next Y,ROA,Target Price,Perf Year,Cash/sh,P/C,EPS next 5Y,ROE,52W Range,Perf YTD,Dividend,P/FCF,EPS past 5Y,ROI,52W High,Beta,Dividend %,Quick Ratio,Sales past 5Y,Gross Margin,52W Low,ATR,Employees,Current Ratio,Sales Q/Q,Oper. Margin,RSI (14),Volatility,Optionable,Debt/Eq,EPS Q/Q,Profit Margin,Rel Volume,Prev Close,Shortable,LT Debt/Eq,Earnings,Payout,Avg Volume,Price,Recom,SMA20,SMA50,SMA200,Volume,Change
-,-,-10.85,4.60%,2.36M,11.00%,8.09M,-,-,-62.38%,1.95M,-16.14%,-14.90M,-,-,2.30%,10.00%,-44.42%,0.00M,-,21.80%,-5.24%,3.10,-38.16%,1.46,2.35,-,-155.10%,65.00,-50.47%,-,-,-,-238.40%,2.91 - 11.20,-38.29%,-,-,54.50%,-,-69.37%,1.63,-,2.20,-,-,17.87%,0.36,15,2.20,-,-,39.83,11.38% 10.28%,No,0.00,68.70%,-,1.48,3.30,Yes,0.00,Feb 28 AMC,-,62.76K,3.43,1.00,-5.21%,-25.44%,-37.33%,"93,166",3.94%
-,-,-0.26,1.50%,268.98M,3.72%,2.25B,38.05,0.22,-0.64%,263.68M,-9.12%,-55.50M,-,0.05,-,9.96%,-12.26%,1.06B,2.12,-328.10%,25.95%,2.32,17.72%,12.61,0.66,650.00%,-0.90%,12.64,-38.73%,0.03,264.87,-,-1.90%,6.69 - 15.27,-0.48%,-,-,-28.70%,0.00%,-45.17%,2.20,-,0.70,16.40%,67.80%,25.11%,0.41,477,0.80,71.90%,5.30%,52.71,4.83% 5.00%,Yes,0.80,7.80%,-5.20%,0.96,7.78,Yes,0.80,Feb 27 AMC,-,11.31M,8.37,2.20,0.99%,-1.63%,-4.72%,"10,843,026",7.58%

If you only want your file to contain data from one run of the script, you do not need a (append) mode; just use w instead.
Writing scraped data to csv
I've been stuck trying to transfer the data that I scraped to a CSV file. Here is my code:

import requests, bs4, csv, sys
reload(sys)
sys.setdefaultencoding('utf-8')

url = 'http://www.constructeursdefrance.com/resultat/?dpt=01'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text,'html.parser')

links = []
for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)

for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data = each.text, each.next_sibling
    with open('french.csv', 'wb') as file:
        writer = csv.writer(file)
        writer.writerows(data)

The output says:

Traceback (most recent call last):
  File "test_new_project.py", line 23, in <module>
    writer.writerows(data)
csv.Error: sequence expected

But I am trying to put tuples into the CSV file, and as far as I know csv accepts tuples and lists. How can I fix this problem?
Atirag is correct, but you have another issue, which is that your with call to open the output file is nested within a for loop. So if there is more than one link, the file will be overwritten each time and the output will not be what you expect. I think this should generate the output you intend:

for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)

with open("french.csv", "w") as file:
    writer = csv.writer(file)
    for i in links:
        res2 = requests.get(i)
        soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
        for each in soup2.select('li > strong'):
            writer.writerow([each.text, each.next_sibling])
Change this:

for each in soup2.select('li > strong'):
    data = each.text, each.next_sibling

to this:

data = []
for each in soup2.select('li > strong'):
    data.append((each.text, each.next_sibling))

Your data variable was one tuple, not a list of tuples. The code above creates a list of tuples. The other solution is this (mind the indentation):

data = []
for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data.append((each.text, each.next_sibling))

with open('french.csv', 'wb') as file:
    writer = csv.writer(file)
    writer.writerows(data)