Iteration Url Load from CSV in Python - python

Please Help me
I have a data url in the CSV file, in that file there are 100 rows and 1 column,
I want to load data line 1 to line 100 from CSV using Python, how do I write the code line?
However, after running the repetition can only work once in one of the lines does not reach the end of the url in the CSV and does not continue to the next URL.
disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
AttributeError: 'NoneType' object has no attribute 'text'
how do I get through if an error occurs when html is not found?
the following line of code I use python, please help so that the looping scrape runs to the end of the url list
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import pandas
with open('Url Torch.csv','rt') as f:
data = csv.reader(f, delimiter=',')
for row in data:
URL_GO = row[2]
def variable_Scrape(url):
try:
cookies = dict(cookie="............")
request = requests.get(url, cookies=cookies)
html = BeautifulSoup(request.content, 'html.parser')
title = html.find('div', class_='title').text.strip().strip('\n')
desc = html.find('div', class_='content').text
link = html.find_all('img', class_='lazyload slide-item owl-lazy')
normal_price = html.find('div', class_='amount public').text.strip().strip('\n')
disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
except AttributeError as e:
print(e)
#ConnectionAbortedError
return False
else:
print(title)
#print(desc)
#print(link)
finally:
print(title)
print(desc)
print(link)
print('Finally.....')
variable_Scrape(URL_GO)

Is hard to give you the exact answer without seeing you csv file but try this:
import csv
f = open('you_file.csv')
csv_f = csv.reader(f)
for row in csv_f:
print row[0]

This is the code
import csv
data = [] #create an empty list to store rows on it
with open('emails.csv') as csv_file:
reader = csv.reader(csv_file)
for row in reader:
data.append(row) #add each row to the list
Based on your comments about passing a loop when the url is not ok:
for url in data: # data is the list where url stored
try:
# do your code here (requests, beautifulsoup) :
# r = requests.get(url) ...
except:
pass
# will go to the next loop (next url) if an error happens

Related

CSV file being exported empty and only the headers are showing?

So Im learning more about python everyday. Im doing a mini web scrape project and at the very end when I should see the results on an exported csv - it comes up blank except for the headers. Any help is gladly appreciated! Thanks.
The code is below:
import csv
import requests
from bs4 import BeautifulSoup
url = "https://www.boxofficemojo.com/year/"
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "html.parser")
box_office_table = soup.find("div", class_="a-section mojo-body aok-relative").find_all("tr")
with open('imdbmovies.csv', 'a', newline='') as csvfile:
writer = csv.writer(csvfile)
# Write headers to CSV file
writer.writerow(['numone_release', 'year', 'total_gross', 'releases', 'average', 'gross_change'])
for row in box_office_table:
try:
year_cell = row.find("td", class_="a-text-left mojo-header-column mojo-field-type-year mojo-sort-column")
money_cells = row.find_all("td", class_="a-text-right mojo-field-type-money")
releases_cell = row.find("td", class_="a-text-right mojo-field-type-positive_integer")
gross_change_cell = row.find("td", class_="a-text-right mojo-number-delta mojo-field-type-percent_delta")
numone_release_cell = row.find("td", class_="a-text-left mojo-field-type-release mojo-cell-wide")
if len(money_cells) >= 2 and year_cell is not None and releases_cell is not None and gross_change_cell is not None and numone_release_cell is not None:
total_gross_cell = money_cells[0]
average_cell = money_cells[1]
year = year_cell.text.strip()
total_gross = total_gross_cell.text.strip()
releases = releases_cell.text.strip()
average = average_cell.text.strip()
gross_change = gross_change_cell.text.strip()
numone_release = numone_release_cell.text.strip()
print(year, total_gross, releases, average, gross_change, numone_release)
# Write the row to the CSV file
writer.writerow([numone_release, year, total_gross, releases, average, gross_change])
except AttributeError:
# Either a cell is not found
pass

csv.writer not writing entire output to CSV file

I am attempting to scrape the artists' Spotify streaming rankings from Kworb.net into a CSV file and I've nearly succeeded except I'm running into a weird issue.
The code below successfully scrapes all 10,000 of the listed artists into the console:
import requests
from bs4 import BeautifulSoup
import csv
URL = "https://kworb.net/spotify/artists.html"
result = requests.get(URL)
src = result.content
soup = BeautifulSoup(src, 'html.parser')
table = soup.find('table', id="spotifyartistindex")
header_tags = table.find_all('th')
headers = [header.text.strip() for header in header_tags]
rows = []
data_rows = table.find_all('tr')
for row in data_rows:
value = row.find_all('td')
beautified_value = [dp.text.strip() for dp in value]
print(beautified_value)
if len(beautified_value) == 0:
continue
rows.append(beautified_value)
The issue arises when I use the following code to save the output to a CSV file:
with open('artist_rankings.csv', 'w', newline="") as output:
writer = csv.writer(output)
writer.writerow(headers)
writer.writerows(rows)
For whatever reason, only 738 of the artists are saved to the file. Does anyone know what could be causing this?
Thanks so much for any help!
As an alternative approach, you might want to make your life easier next time and use pandas.
Here's how:
import requests
import pandas as pd
source = requests.get("https://kworb.net/spotify/artists.html")
df = pd.concat(pd.read_html(source.text, flavor="bs4"))
df.to_csv("artists.csv", index=False)
This outputs a .csv file with 10,000 artists.

Can't write in csv file

When I try to write the information in the csv file, error is thrown:
Traceback (most recent call last):
File "sizeer.py", line 68, in <module>
writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup
proxies = {
"http":"http://195.189.60.97:3128",
"http":"http://103.78.75.165:8080",
"http":"http://212.87.220.2:3128",
"http":"http://88.99.134.61:8080",
"http":"http://103.102.139.178:8080",
"http":"http://218.60.8.83:3129",
"http":"http://124.121.105.193:8888",
"http":"http://198.237.114.54:8080",
"http":"http://36.67.106.58:8080",
"http":"http://35.214.241.28:3128"
}
base_url = ...
page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
#Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
anchor = cw.find("a", recursive=False)
categories.append(anchor['href'])
#Iterrate categories
for category in categories:
cat_page = requests.get(base_url + category, proxies=proxies)
cat_soup = BeautifulSoup(cat_page.content, 'lxml')
products_wrapper = cat_soup.find(class_="b-productList")
cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]
#Iterrate category with pagination and get products
for i in range(1, max_page+1):
cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")
for link in product_links:
#Get product data
product_page = requests.get(base_url+link['href'], proxies=proxies)
product_soup = BeautifulSoup(product_page.content, 'lxml')
#Get product variations
variations = product_soup.find_all(class_="m-productDescr_colorItem")
#If there are variations
if len(variations) > 0:
for v in variations:
variation_page = requests.get(base_url+v['href'], proxies=proxies)
variation_soup = BeautifulSoup(variation_page.content, 'lxml')
price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
name = variation_soup.find(class_="m-productDescr_headline").text.strip()
color = v['title']
print(name)
print(color)
print(price)
print("-------------")
#Save in csv
writer.writerow([name,color,price])
print("SCRAPING DONE")
How to keep the file open through the whole script execution ? Or I have to open it every time I am adding content ? EDIT In fact, the file is not even created.
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block - that is the block's purpose.
You could put everything inside the block, but that only makes the existing problem worse: the code is reaching several levels of indents, is long and becomes difficult to understand. This is why you use functions to organize the code. For example, if you have the big for loop set in a function:
def do_stuff_with(categories, writer):
for category in categories:
# lots of logic here
# use `writer.writerow` when needed
# Get everything else set up that doesn't need the file, first
categories = ... # do the BeautifulSoup input stuff
# then we can open the file and use the function:
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic, for handling the variations for a single product. Or you can have a function to handle the creation of the categories data, and return it.

How do I creat CSV file with webscraped content from several URLs?

I want to create a CSV file from webscraped content. The content is from FinViz.com
I want to scrape the table from this website 20 times for 20 different stocks and input all the content into a CSV file. Within my code, I generate a list of stocks from a scrape of twitter content. The list of stocks that is generated is the same list that I want to get information on from the FinViz.com tables.
Here is my code:
import csv
import urllib.request
from bs4 import BeautifulSoup
twiturl = "https://twitter.com/ACInvestorBlog"
twitpage = urllib.request.urlopen(twiturl)
soup = BeautifulSoup(twitpage,"html.parser")
print(soup.title.text)
tweets = [i.text for i in soup.select('a.twitter-cashtag.pretty-link.js-nav b')]
print(tweets)
url_base = "https://finviz.com/quote.ashx?t="
url_list = [url_base + tckr for tckr in tweets]
for url in url_list:
fpage = urllib.request.urlopen(url)
fsoup = BeautifulSoup(fpage, 'html.parser')
# scrape single page and add data to list
# write datalist
with open('today.csv', 'a') as file:
writer = csv.writer(file)
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
The trouble that I am running into is that my CSV file only has the webscraped data from the last item in the list. Instead I want the entire list in a sequence of rows.
Here is what my CSV file looks like:
Index,P/E,EPS (ttm),Insider Own,Shs Outstand,Perf Week,Market Cap,Forward P/E,EPS next Y,Insider Trans,Shs Float,Perf Month,Income,PEG,EPS next Q,Inst Own,Short Float,Perf Quarter,Sales,P/S,EPS this Y,Inst Trans,Short Ratio,Perf Half Y,Book/sh,P/B,EPS next Y,ROA,Target Price,Perf Year,Cash/sh,P/C,EPS next 5Y,ROE,52W Range,Perf YTD,Dividend,P/FCF,EPS past 5Y,ROI,52W High,Beta,Dividend %,Quick Ratio,Sales past 5Y,Gross Margin,52W Low,ATR,Employees,Current Ratio,Sales Q/Q,Oper. Margin,RSI (14),Volatility,Optionable,Debt/Eq,EPS Q/Q,Profit Margin,Rel Volume,Prev Close,Shortable,LT Debt/Eq,Earnings,Payout,Avg Volume,Price,Recom,SMA20,SMA50,SMA200,Volume,Change
-,-,-1.75,7.94%,79.06M,-22.52%,296.48M,-,-1.74,-4.61%,72.41M,-23.16%,-85.70M,-,-0.36,62.00%,3.21%,1.63%,15.10M,19.63,-197.00%,18.05%,2.57,66.67%,-0.65,-,-8.10%,-127.70%,12.17,-6.25%,0.93,4.03,-,146.70%,2.05 - 5.86,3.59%,-,-,-,385.80%,-36.01%,-,-,1.30,-,76.50%,82.93%,0.41,100,1.30,-59.60%,-,36.98,16.13% 9.32%,Yes,-,90.00%,-,0.82,3.63,Yes,-,Nov 08,-,902.43K,3.75,2.30,-22.08%,-10.43%,11.96%,"742,414",3.31%
It would be better to open your output file first, rather than keep on opening/closing it for each URL that you fetch. Exception handling is needed to catch cases where the URL does not exist.
Also on your output, you should open the file with newline='' to avoid extra empty lines being written to the file:
import csv
import urllib.request
from bs4 import BeautifulSoup
write_header = True
twiturl = "https://twitter.com/ACInvestorBlog"
twitpage = urllib.request.urlopen(twiturl)
soup = BeautifulSoup(twitpage,"html.parser")
print(soup.title.text)
tweets = [i.text for i in soup.select('a.twitter-cashtag.pretty-link.js-nav b')]
print(tweets)
url_base = "https://finviz.com/quote.ashx?t="
url_list = [url_base + tckr for tckr in tweets]
with open('today.csv', 'w', newline='') as file:
writer = csv.writer(file)
for url in url_list:
try:
fpage = urllib.request.urlopen(url)
fsoup = BeautifulSoup(fpage, 'html.parser')
# write header row (once)
if write_header:
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
write_header = False
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
except urllib.error.HTTPError:
print("{} - not found".format(url))
So today.csv would start like:
Index,P/E,EPS (ttm),Insider Own,Shs Outstand,Perf Week,Market Cap,Forward P/E,EPS next Y,Insider Trans,Shs Float,Perf Month,Income,PEG,EPS next Q,Inst Own,Short Float,Perf Quarter,Sales,P/S,EPS this Y,Inst Trans,Short Ratio,Perf Half Y,Book/sh,P/B,EPS next Y,ROA,Target Price,Perf Year,Cash/sh,P/C,EPS next 5Y,ROE,52W Range,Perf YTD,Dividend,P/FCF,EPS past 5Y,ROI,52W High,Beta,Dividend %,Quick Ratio,Sales past 5Y,Gross Margin,52W Low,ATR,Employees,Current Ratio,Sales Q/Q,Oper. Margin,RSI (14),Volatility,Optionable,Debt/Eq,EPS Q/Q,Profit Margin,Rel Volume,Prev Close,Shortable,LT Debt/Eq,Earnings,Payout,Avg Volume,Price,Recom,SMA20,SMA50,SMA200,Volume,Change
-,-,-10.85,4.60%,2.36M,11.00%,8.09M,-,-,-62.38%,1.95M,-16.14%,-14.90M,-,-,2.30%,10.00%,-44.42%,0.00M,-,21.80%,-5.24%,3.10,-38.16%,1.46,2.35,-,-155.10%,65.00,-50.47%,-,-,-,-238.40%,2.91 - 11.20,-38.29%,-,-,54.50%,-,-69.37%,1.63,-,2.20,-,-,17.87%,0.36,15,2.20,-,-,39.83,11.38% 10.28%,No,0.00,68.70%,-,1.48,3.30,Yes,0.00,Feb 28 AMC,-,62.76K,3.43,1.00,-5.21%,-25.44%,-37.33%,"93,166",3.94%
-,-,-0.26,1.50%,268.98M,3.72%,2.25B,38.05,0.22,-0.64%,263.68M,-9.12%,-55.50M,-,0.05,-,9.96%,-12.26%,1.06B,2.12,-328.10%,25.95%,2.32,17.72%,12.61,0.66,650.00%,-0.90%,12.64,-38.73%,0.03,264.87,-,-1.90%,6.69 - 15.27,-0.48%,-,-,-28.70%,0.00%,-45.17%,2.20,-,0.70,16.40%,67.80%,25.11%,0.41,477,0.80,71.90%,5.30%,52.71,4.83% 5.00%,Yes,0.80,7.80%,-5.20%,0.96,7.78,Yes,0.80,Feb 27 AMC,-,11.31M,8.37,2.20,0.99%,-1.63%,-4.72%,"10,843,026",7.58%
If you only want your file to contain data from one run of the script, you do not need a to append, just use w instead.

How to write a new column to csv when webscraping?

I'd like to get some quick help on writing this webscraping program. So far it's scraping things correctly, but I'm having trouble writing it to a csv file.
I'm scraping two things from each reviewer: Review score AND written review
I'd like to write the review score into the first column, and the written review into the second column. However, writerow only does it row by row.
Appreciate any help on this! :)
import os, requests, csv
from bs4 import BeautifulSoup
# Get URL of the page
URL = ('https://www.tripadvisor.com/Attraction_Review-g294265-d2149128-Reviews-Gardens_by_the_Bay-Singapore.html')
with open('GardensbytheBay.csv', 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
# Looping until the 5th page of reviews
for pagecounter in range(3):
# Request get the first page
res = requests.get(URL)
res.raise_for_status
# Download the html of the first page
soup = BeautifulSoup(res.text, "html.parser")
# Match it to the specific tag for all 5 ratings
reviewElems = soup.findAll('img', {'class': ['sprite-rating_s_fill rating_s_fill s50', 'sprite-rating_s_fill rating_s_fill s40', 'sprite-rating_s_fill rating_s_fill s30', 'sprite-rating_s_fill rating_s_fill s20', 'sprite-rating_s_fill rating_s_fill s10']})
reviewWritten = soup.findAll('p', {'class':'partial_entry'})
if reviewElems:
for row, rows in zip(reviewElems, reviewWritten):
review_text = row.attrs['alt'][0]
review2_text = rows.get_text(strip=True).encode('utf8', 'ignore').decode('latin-1')
writer.writerow([review_text])
writer.writerow([review2_text])
print('Writing page', pagecounter + 1)
else:
print('Could not find clue.')
# Find URL of next page and update URL
if pagecounter == 0:
nextLink = soup.select('a[data-offset]')[0]
elif pagecounter != 0:
nextLink = soup.select('a[data-offset]')[1]
URL = 'http://www.tripadvisor.com' + nextLink.get('href')
print('Download complete')
You can put the review score and text in the same row but different columns with:
writer.writerow([review_text, review2_text])
Your initial approach takes each of the items as a separate row and writes them in succession which is not what you want.
You can use pandas dataFrame:
import pandas as pd
import numpy as np
csv_file = pd.read_csv('GardensbytheBay.csv')
csv_file.insert(idx, cloname, value)
csv_input.to_csv('output.csv', index=False)

Categories

Resources