Writing scraped data to csv - python

I've been stuck trying to transfer the data that I scraped to a csv file. Here is my code:
import requests, bs4, csv, sys
reload(sys)
sys.setdefaultencoding('utf-8')
url = 'http://www.constructeursdefrance.com/resultat/?dpt=01'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
links = []
for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)
for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data = each.text, each.next_sibling
        with open('french.csv', 'wb') as file:
            writer = csv.writer(file)
            writer.writerows(data)
the output says:
Traceback (most recent call last):
File "test_new_project.py", line 23, in <module>
writer.writerows(data)
csv.Error: sequence expected
But I am trying to put tuples into the csv file, and as far as I know csv accepts tuples and lists. How can I fix this problem?

Atirag is correct, but you have another issue: the with call that opens the output file is nested inside a for loop. So if there is more than one link, the file will be overwritten each time and the output will not be what you expect. I think this should generate the output you intend:
for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)

with open("french.csv", "w") as file:
    writer = csv.writer(file)
    for i in links:
        res2 = requests.get(i)
        soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
        for each in soup2.select('li > strong'):
            writer.writerow([each.text, each.next_sibling])

Change this
for each in soup2.select('li > strong'):
    data = each.text, each.next_sibling
to this
data = []
for each in soup2.select('li > strong'):
    data.append((each.text, each.next_sibling))
Your data variable was a single tuple, not a list of tuples. The above code builds a list of tuples.
Another solution is this (mind the indentation):
data = []
for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data.append((each.text, each.next_sibling))

with open('french.csv', 'wb') as file:
    writer = csv.writer(file)
    writer.writerows(data)
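For reference, the distinction behind the original "sequence expected" error: writer.writerows expects an iterable of rows (for example a list of tuples), while writer.writerow expects a single row. A minimal sketch of the difference (the filename and rows are purely illustrative):

import csv

rows = [('name', 'Alice'), ('age', '30')]   # a list of tuples: one tuple per CSV row

with open('example.csv', 'w') as f:         # on Python 3, add newline='' here
    writer = csv.writer(f)
    writer.writerow(rows[0])                # writes a single row: name,Alice
    writer.writerows(rows)                  # writes every tuple in the list as its own row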

Related

Adding Data from Beautiful Soup table to a list

Hello, I'm a beginner to Python and programming in general, and I was wondering how I would make the outputted data a list. I used BeautifulSoup to extract data from a table and attempted to make a list with the data, but I end up only adding the first number to the list. Can someone provide me assistance and an explanation?
from bs4 import BeautifulSoup
from requests_html import HTMLSession

s = HTMLSession()
url = 'https://www.timeanddate.com/weather/usa/new-york/ext'

def get_data(url):
    r = s.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

with open('document.txt', 'a') as f:
    f.write(str(get_data(url)))

with open('document.txt', 'r') as html_file:
    contents = html_file.read()

soup = BeautifulSoup(contents, 'lxml')
forecast_table = soup.find('table', class_ = 'zebra tb-wt fw va-m tb-hover')

wtitle = soup.title.text
print(wtitle)
print("------")

def get_weather_high(forecast_table):
    print("Weather Highs:")
    for high in forecast_table.find_all('tbody'):
        rows1 = high.find_all('tr')
        for row1 in rows1:
            pl_high = row1.find_all('td')
            pl_high = [td.text.strip() for td in pl_high]
            pl_high = pl_high[1:2]
            for pl_high_final in pl_high:
                pl_high_final = pl_high_final[0:3]
                print(pl_high_final)

get_weather_high(forecast_table)
This is the output. Instead of each line being a separate number, I want to have it all in one list.
Create a list before your for loop and append your data to it instead of printing it; then print the list after the for loop.
data = []

def get_weather_high(forecast_table):
    print("Weather Highs:")
    for high in forecast_table.find_all('tbody'):
        rows1 = high.find_all('tr')
        for row1 in rows1:
            pl_high = row1.find_all('td')
            pl_high = [td.text.strip() for td in pl_high]
            pl_high = pl_high[1:2]
            for pl_high_final in pl_high:
                pl_high_final = pl_high_final[0:3]
                data.append(pl_high_final)

get_weather_high(forecast_table)
print(data)  # or return data if you need it somewhere else
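If you would rather not rely on a module-level data list, a small variation (same selectors and slicing as the question, other names my own) builds and returns the list inside the function:

def get_weather_high(forecast_table):
    highs = []
    for tbody in forecast_table.find_all('tbody'):
        for row in tbody.find_all('tr'):
            cells = [td.text.strip() for td in row.find_all('td')]
            if len(cells) > 1:
                highs.append(cells[1][0:3])  # second column, first three characters, as in the question
    return highs

data = get_weather_high(forecast_table)
print(data)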

I'm trying to deduplicate weblinks scraped using Python & BeautifulSoup but it's not working

I'm trying to scrape a website in Python. I got the links to print, but in trying to make them a set to deduplicate, there are still duplicates. Anyone have any advice on what I am doing wrong? Thanks in advance!
Edit: So I tried what John suggested, but my csv output is a cascading list of links across the Excel sheet, it's crazy... I'll post the changes below this original code:
import requests
from bs4 import BeautifulSoup

page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink) )
#output = The number of links to start with are: 254

import csv
with open('census_links.csv', 'w', newline='') as f:
    weblinks = str(mylink)
    writer = csv.writer(f, delimiter = ' ', lineterminator = '\r')
    for link in mylink:
        hrefs = str(link.get('href'))
        if hrefs.startswith("None"):
            ''
        elif hrefs.startswith('http'):
            MySet = set()
            MySet.add(hrefs)
        elif hrefs.startswith('#'):
            ''
        elif hrefs.startswith(' '):
            ''
        print(set(MySet))
        file.write(str(MySet)+'\n')
    file.close
#Edited code:
import requests
from bs4 import BeautifulSoup

page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink))
# The number of links to start with are: 254

import csv
with open('census_links.csv', 'w', newline='') as f:
    weblinks = str(mylink)
    writer = csv.writer(f, delimiter = ',', lineterminator = '\r')
    MySet = set()
    for link in mylink:
        hrefs = str(link.get('href'))
        if hrefs.startswith("None"):
            continue
        elif hrefs.startswith('#'):
            continue
        elif hrefs.startswith(' '):
            continue
        elif hrefs.startswith('http'):
            MySet.add(hrefs)
            file.write(str(MySet)+'\n')
    file.close
    print(str(MySet) +'\n')
To get unique links, you want to check whether the link is already in MySet, using hrefs not in MySet.
For a simple task like this you don't need the csv module: to write a single column, use "\n".join(MySet), and to write a single row, use ",".join(MySet).
MySet = set()

for link in mylink:
    hrefs = link.get('href')
    if not hrefs or hrefs.startswith('#'):
        continue
    # normalize link
    if hrefs.startswith('/'):
        hrefs = 'https://www.census.gov' + hrefs
    # check if link already in MySet
    if hrefs not in MySet:
        MySet.add(hrefs)

with open('census_links.csv', 'w', newline='') as f:
    f.write("\n".join(MySet))

print("\n".join(MySet))
Initialize the set before the loop, and wait to print it until after the loop is done.
MySet = set()
...
for link in mylink:
    hrefs = str(link.get('href'))
    ...
    if hrefs.startswith('http'):
        MySet.add(hrefs)
...
print(MySet)
The same code as before is used to get the page content.
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink) )
#output = The number of links to start with are: 254
Then use pandas to get the unique URLs which start with http.
import pandas as pd
obj = pd.Series(mylink)
obj_link = obj.map(lambda x: x.get('href')).drop_duplicates().dropna()
cond = obj_link.str.startswith('http')
dfn = obj_link.loc[cond].to_frame()
dfn.shape # (93, 1)
dfn.to_csv('census_links.csv', index=False, header=False)

Can't write in csv file

When I try to write the information to the csv file, an error is thrown:
Traceback (most recent call last):
File "sizeer.py", line 68, in <module>
writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup

proxies = {
    "http":"http://195.189.60.97:3128",
    "http":"http://103.78.75.165:8080",
    "http":"http://212.87.220.2:3128",
    "http":"http://88.99.134.61:8080",
    "http":"http://103.102.139.178:8080",
    "http":"http://218.60.8.83:3129",
    "http":"http://124.121.105.193:8888",
    "http":"http://198.237.114.54:8080",
    "http":"http://36.67.106.58:8080",
    "http":"http://35.214.241.28:3128"
}

base_url = ...
page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
    exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')

with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])

#Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
    anchor = cw.find("a", recursive=False)
    categories.append(anchor['href'])

#Iterrate categories
for category in categories:
    cat_page = requests.get(base_url + category, proxies=proxies)
    cat_soup = BeautifulSoup(cat_page.content, 'lxml')
    products_wrapper = cat_soup.find(class_="b-productList")
    cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
    max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]

    #Iterrate category with pagination and get products
    for i in range(1, max_page+1):
        cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
        cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
        product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")

        for link in product_links:
            #Get product data
            product_page = requests.get(base_url+link['href'], proxies=proxies)
            product_soup = BeautifulSoup(product_page.content, 'lxml')

            #Get product variations
            variations = product_soup.find_all(class_="m-productDescr_colorItem")

            #If there are variations
            if len(variations) > 0:
                for v in variations:
                    variation_page = requests.get(base_url+v['href'], proxies=proxies)
                    variation_soup = BeautifulSoup(variation_page.content, 'lxml')
                    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
                    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
                    color = v['title']
                    print(name)
                    print(color)
                    print(price)
                    print("-------------")

                    #Save in csv
                    writer.writerow([name,color,price])

print("SCRAPING DONE")
How do I keep the file open through the whole script execution? Or do I have to open it every time I am adding content? EDIT: In fact, the file is not even created.
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block - that is the block's purpose.
You could put everything inside the block, but that only makes the existing problem worse: the code reaches several levels of indentation, gets long, and becomes difficult to understand. This is why you use functions to organize the code. For example, put the big for loop in a function:
def do_stuff_with(categories, writer):
    for category in categories:
        # lots of logic here
        # use `writer.writerow` when needed

# Get everything else set up that doesn't need the file, first
categories = ... # do the BeautifulSoup input stuff

# then we can open the file and use the function:
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])
    do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic, for handling the variations for a single product. Or you can have a function to handle the creation of the categories data, and return it.
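As a rough sketch of that further step, here is one way the innermost variation handling could be pulled out (my own restructuring, not from the original answer; it assumes base_url, proxies, requests and BeautifulSoup are set up as in the question, and scrape_variation is a hypothetical name):

def scrape_variation(v):
    # Fetch one colour variation's page and return a (name, color, price) row,
    # using the same CSS classes as the question's code.
    variation_soup = BeautifulSoup(requests.get(base_url + v['href'], proxies=proxies).content, 'lxml')
    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
    return name, v['title'], price

# then, inside do_stuff_with, the deepest loop shrinks to:
for v in variations:
    writer.writerow(scrape_variation(v))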

Writing data scraped from an HTML table to a CSV file

I'm trying to figure out what the next step is to convert my web scrape to CSV.
I've tried putting every column into an individual list, but I feel like this is not the solution.
from bs4 import BeautifulSoup
import requests
url = 'https://www.pro-football-reference.com/years/2018/passing.htm'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
for row in tb.find_all('tr'):
    i = row.get_text()
    print(i)
This should work:
import csv  # quite crucial

final_table = []
for row in tb.find_all('tr'):
    next_line = row.get_text()
    final_table.append([next_line])

with open('output.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(final_table)
Use the csv module. We'll grab the headers with soup.find("tr").find_all("th"), then loop over the body and write it to the text file. The first cell of each row is a <th>, so we need to handle that separately and prepend it to the <td> data. Note that the staggered headers every 30 lines are omitted.
import csv
import requests
from bs4 import BeautifulSoup
url = "https://www.pro-football-reference.com/years/2018/passing.htm"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
with open("output.csv", "w") as f:
writer = csv.writer(f)
writer.writerow([x.get_text() for x in soup.find("tr").find_all("th")])
    for row in soup.find_all("tr"):
        data = [x.get_text() for x in row.find_all("td")]
        if data:  # the repeated header rows contain only <th> cells, so they are skipped
            writer.writerow([row.find("th").get_text()] + data)
Output (just the top few rows):
Rk,Player,Tm,Age,Pos,G,GS,QBrec,Cmp,Att,Cmp%,Yds,TD,TD%,Int,Int%,Lng,Y/A,AY/A,Y/C,Y/G,Rate,QBR,Sk,Yds,NY/A,ANY/A,Sk%,4QC,GWD
1,Ben Roethlisberger,PIT,36,QB,16,16,9-6-1,452,675,67.0,5129,34,5.0,16,2.4,97,7.6,7.5,11.3,320.6,96.5,71.0,24,166,7.10,7.04,3.4,2,3
2,Andrew Luck*,IND,29,QB,16,16,10-6-0,430,639,67.3,4593,39,6.1,15,2.3,68,7.2,7.4,10.7,287.1,98.7,69.4,18,134,6.79,6.95,2.7,3,3
3,Matt Ryan,ATL,33,QB,16,16,7-9-0,422,608,69.4,4924,35,5.8,7,1.2,75,8.1,8.7,11.7,307.8,108.1,68.5,42,296,7.12,7.71,6.5,1,1
4,Kirk Cousins,MIN,30,QB,16,16,8-7-1,425,606,70.1,4298,30,5.0,10,1.7,75,7.1,7.3,10.1,268.6,99.7,58.2,40,262,6.25,6.48,6.2,1,0
Check this thread if you see extra newlines in the CSV result on Windows.
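A common fix for those extra blank lines (from the csv module documentation, not specific to that thread) is to open the file with newline='' so the writer controls line endings itself:

import csv

rows = [["Rk", "Player", "Tm"], ["1", "Ben Roethlisberger", "PIT"]]  # illustrative rows

# newline='' prevents the extra blank line between rows on Windows
with open("output.csv", "w", newline="") as f:
    csv.writer(f).writerows(rows)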

Beautiful Soup - Results to CSV for all items in lists

The below snippet "works" but is only outputting the first record to the CSV. I'm trying to get it to output the same data, but for each gun in the list of gun URLs in the all_links list.
Any modification I've made to it with prints for the output (just to see it working) prints the same result, and if I make a gun_details list and try to print it, I get the same one-item output.
How would I go about printing all the gun_details labels and spans into a CSV?
import csv
import urllib.request
import requests
from bs4 import BeautifulSoup
all_links = []
url = "https://www.guntrader.uk/dealers/minsterley/minsterley-ranges/guns?page={}"
for page in range(1, 3):
    res = requests.get(url).text
    soup = BeautifulSoup(res, "html.parser")
    for link in soup.select(
        'a[href*="dealers/minsterley/minsterley-ranges/guns/shotguns/"]'
    ):
        all_links.append("https://www.guntrader.uk" + link["href"])

for a_link in all_links:
    gun_label = []
    gun_span = []
    res = urllib.request.urlopen(a_link)
    # res = requests.get(a_link)
    soup = BeautifulSoup(res, "html.parser")
    for gun_details in soup.select("div.gunDetails"):
        for l in gun_details.select("label"):
            gun_label.append(l.text.replace(":", ""))
        for s in gun_details.select("span"):
            gun_span.append(s.text)

my_dict = dict(zip(gun_label, gun_span))

with open("mycsvfile.csv", "w") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=None)
    for key in my_dict.keys():
        csvfile.write(f"{key},{my_dict[key]}\n")
Try running the middle section this way:
for a_link in all_links:
    gun_label = []
    gun_span = []
    res = requests.get(a_link)
    soup = BeautifulSoup(res.content, 'html.parser')  # note it's 'res.content', not just 'res'
    for gun_details in soup.select('div.gunDetails'):
        for l in gun_details.select('label'):
            gun_label.append(l.text.replace(':',''))
        for s in gun_details.select('span'):
            gun_span.append(s.text)

    # this block is now indented differently - it's INSIDE the 'for' loop
    my_dict = dict(zip(gun_label, gun_span))
    with open('mycsvfile.csv', 'a') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=None)
        for key in my_dict.keys():
            csvfile.write(f"{key},{my_dict[key]}\n")
