I am having some issues while saving rows in a csv file after web scraping. I used the same notation and it worked well before on another site but now the csv file is blank. It seems python is not writing any row.
I show you my code, thanks in advance:
import requests
from bs4 import BeautifulSoup
import csv
import lxml
html_page = requests.get('https://www.scrapethissite.com/pages/forms/?page_num=1').text
soup = BeautifulSoup(html_page, 'lxml')
# get the number of pages (it might change in the future as the data is updated)
pagenum = soup.find('ul', {'class': 'pagination'})
n = pagenum.findAll('li')[-2].find('a')['href'].split('=')[1]
# now we convert the value of the page in a range so that we can loop over it
page = range(1, int(n) + 1)
print(page)
with open('HockeyLeague.csv', 'w') as f:
csv_writer = csv.writer(f)
csv_writer.writerow(['team_name', 'year', 'wins', 'losses', 'win_perc', 'goal_for', 'goal_against'])
for p in page:
html_page = requests.get(f'https://www.scrapethissite.com/pages/forms/?page_num={p}&per_page=25').text
soup = BeautifulSoup(html_page, 'lxml')
table = soup.find('table', {'class': 'table'})
for row in table.findAll('tr', {'class': 'team'}):
# getting the wanted variables:
team_name = row.find('td', {'class': 'name'}).text
year = row.find('td', {'class': 'year'}).text
wins = row.find('td', {'class': 'wins'}).text
losses = row.find('td', {'class': 'losses'}).text
goal_for = row.find('td', {'gf'}).text
goal_against = row.find('td', {'ga'}).text
try:
win_perc = row.find('td', {'pct text-success'}).text
except:
win_perc = row.find('td', {'pct text-danger'}).text
# write the data in the csv file we created at the beginning
csv_writer.writerow([team_name, year, wins, losses, win_perc, goal_for, goal_against])
Cause script in general is working these are just some things you should keep in mind:
I would recommend opening the file with newline='' on all platforms
to disable universal newlines translation and encoding='utf-8' to
be sure you are working on the "correct" one:
with open('HockeyLeague.csv', 'w', newline='', encoding='utf-8') as f:
...
.strip() your texts or use .get_text(strip=True) to get a clean
output and avoid linebreaks you do not wont.
team_name = row.find('td', {'class': 'name'}).text.strip()
year = row.find('td', {'class': 'year'}).text.strip()
...
In newer code avoid old syntax findAll() instead use find_all() -
For more take a minute to check
docs
Alternative Example
Uses a while loop the checks the "Next Button" and extract its url, also stripped_strings to extract the texts from each row:
import requests
from bs4 import BeautifulSoup
import csv
url = 'https://www.scrapethissite.com/pages/forms/'
with open('HockeyLeague.csv', 'w', newline='', encoding='utf-8') as f:
csv_writer = csv.writer(f)
csv_writer.writerow(['team_name', 'year', 'wins', 'losses', 'win_perc', 'goal_for', 'goal_against'])
while True:
html_page = requests.get(url).text
soup = BeautifulSoup(html_page)
for row in soup.find_all('tr', {'class': 'team'}):
# write the data in the csv file we created at the beginning
csv_writer.writerow(list(row.stripped_strings)[:-1])
if soup.select_one('.pagination a[aria-label="Next"]'):
url = 'https://www.scrapethissite.com'+soup.select_one('.pagination a[aria-label="Next"]').get('href')
else:
break
Output
team_name,year,wins,losses,win_perc,goal_for,goal_against
Boston Bruins,1990,44,24,0.55,299,264
Buffalo Sabres,1990,31,30,0.388,292,278
Calgary Flames,1990,46,26,0.575,344,263
Chicago Blackhawks,1990,49,23,0.613,284,211
Detroit Red Wings,1990,34,38,0.425,273,298
Edmonton Oilers,1990,37,37,0.463,272,272
...
Related
I'm a newbie to python and am just teaching myself how to code and scrape data, hoping someone can explain what Im doing wrong or why from the following script do I only get the headers, but no data is inserted into the text file?
Is it because its returning none or empty data fields from the scrape? or am I missing something in my logic
Not getting any errors from the code it would seem see idle out image below
code
# import necessary libraries
import requests
from bs4 import BeautifulSoup
import csv
import datetime
# get the current date
date = datetime.datetime.now().strftime("%Y%m%d")
# create the output file
filename = 'C:/Users/AJS2/Documents/datafiles_test/' + date + '.txt'
with open(filename, 'w', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
writer.writerow(['url', 'name', 'address', 'phone', 'email'])
# define the base url
base_url = "http://www.wedding-planners.com/index.cfm"
# get the page
page = requests.get(base_url)
# create a beautifulsoup object
soup = BeautifulSoup(page.content, 'html.parser')
# get the next button
next_button = soup.find('a', class_='Next')
# set the counter
counter = 0
# loop through the pages
while next_button and counter < 50:
# get the list of wedding planners
wedding_planners = soup.find_all('div', class_='plannerName')
# loop through the list of wedding planners
for planner in wedding_planners:
# get the url
url = planner.a['href']
# get the page
page = requests.get(url)
# create a beautifulsoup object
soup = BeautifulSoup(page.content, 'html.parser')
# get the name
name = soup.find('h1', class_='head1').text
# get the address
address = soup.find('span', class_='address').text
# get the phone
phone = soup.find('span', class_='phone').text
# get the email
email = soup.find('span', class_='email').text
# save the data
with open(filename, 'a', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
writer.writerow([url, name, address, phone, email])
# increment the counter
counter += 1
# get the next page
page = requests.get(next_button['href'])
# create a beautifulsoup object
soup = BeautifulSoup(page.content, 'html.parser')
# get the next button
next_button = soup.find('a', class_='Next')
print('Finished scraping')
So I want to scrape some phone numbers from a site. The only problem is that they are hidden behind a click. I can't go and click all of them to make them scrape-able so I wanted to ask if there is any way to get them from the 'data-phone' attribute inside the span tag.
I tried to use data_='data-phone' but that doesn't work.
from bs4 import BeautifulSoup
import requests
import csv
source = requests.get('https://software-overzicht.nl/amersfoort?page=1').text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['title, location'])
for number in soup.find_all('span', data_='data-phone'):
print(number)
for info in soup.find_all('div', class_='company-info-top'):
title = info.a.text
location = info.p.text
csv_writer.writerow([title, location])
csv_file.close()
change
for number in soup.find_all('span', data_='data-phone'):
print(number)
to
for number in soup.find_all('span', class_='phone'):
print(number['data-phone'])
Output:
0334226800
0878739737
0334558584
0334798200
0334720311
0334677050
0334554948
0334535384
0337767840
0334560292
0626214363
0334559065
0334506506
0620423525
0334556166
0332012581
0334557485
0334946111
0334536200
0334545111
0334545430
0337851805
033-4721544
06-26662490
To incorporate that into your csv:
from bs4 import BeautifulSoup
import requests
import csv
with open('C:/cms_scrape.csv','w', newline='') as f:
csv_writter = csv.writer(f)
csv_writter.writerow(['naambedrijf', 'adress', 'phone'])
for page in range(1, 22):
url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')
for search in soup.find_all('div', class_='company-info-top'):
title = search.a.text.strip()
adress = search.p.text.strip()
try:
phone = search.find('span', {'class':'phone'})['data-phone']
except:
phone = 'N/A'
print(title)
csv_writter.writerow([title,adress,phone])
I have managed to build a very primitive program to scrape vehicle data from pistonheads and print it to a .csv file with the link, make, model and am working on getting the price which is where I am encountering a problem.
I want to scrape the prices to the fourth column in my .csv file (Price) and to correctly print the prices from each vehicle on the website.
I am only getting it to print the price from one vehicle and repeat it again and again next to each vehicle in the .csv file.
I have tried soup.findAll and soup.find_all to see whether parsing through multiple elements would work but this is just creating a bigger mess.
Might someone be able to help?
I am also trying to scrape the image src and would like to print that on another column (5) called images.
import csv ; import requests
from bs4 import BeautifulSoup
outfile = open('pistonheads.csv','w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Link", "Make", "Model", "Price"])
url = 'https://www.pistonheads.com/classifieds?Category=used-cars&Page=1&ResultsPerPage=100'
get_url = requests.get(url)
get_text = get_url.text
soup = BeautifulSoup(get_text, 'html.parser')
car_link = soup.find_all('div', 'listing-headline', 'price')
for div in car_link:
links = div.findAll('a')
for a in links:
link = ("https://www.pistonheads.com" + a['href'])
make = (a['href'].split('/')[-4])
model = (a['href'].split('/')[-3])
price = soup.find('span')
writer.writerow([link, make, model, price])
print(link, make, model, price)
outfile.close()
You can try this:
import csv, requests, re
from urllib.parse import urlparse
from bs4 import BeautifulSoup as soup
d = soup(requests.get('https://www.pistonheads.com/classifieds?Category=used-cars&ResultsPerPage=100').text, 'html.parser')
def extract_details(_s:soup) -> list:
_link = _s.find('a', {'href':re.compile('/classifieds/used\-cars/')})['href']
_, _, make, model, *_ = _link[1:].split('/')
price, img = _s.find('div', {'class':'price'}).text, [i['src'] for i in _s.find_all('img')]
return [_link, make, model, price, 'N/A' if not img else img[0]]
with open('filename.csv', 'w') as f:
_listings = [extract_details(i) for i in d.find_all('div', {'class':'ad-listing'}) if i.find('div', {'class':'price'})]
write = csv.writer(f)
write.writerows([['make', 'model', 'price', 'img'], *_listings])
The reason is because of price = soup.find('span')
.find() will grab the first element it finds. And you have it looking into your soup object. But where you want it to look, is within your a, because that's what you are looping through with for a in links:
I also add .text as I am assuming you just want the text, not the whole tag element. Ie price = a.find('span').text
import csv ; import requests
from bs4 import BeautifulSoup
outfile = open('pistonheads.csv','w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Link", "Make", "Model", "Price", 'Images'])
url = 'https://www.pistonheads.com/classifieds?Category=used-cars&Page=1&ResultsPerPage=100'
get_url = requests.get(url)
get_text = get_url.text
soup = BeautifulSoup(get_text, 'html.parser')
car_link = soup.find_all('div', 'listing-headline', 'price')
for div in car_link:
links = div.findAll('a')
for a in links:
link = ("https://www.pistonheads.com" + a['href'])
make = (a['href'].split('/')[-4])
model = (a['href'].split('/')[-3])
price = a.find('span').text
image_link = a.parent.parent.find('img')['src']
image = link + image_link
writer.writerow([link, make, model, price, image])
print(link, make, model, price, image)
outfile.close()
I want scraping the exchange prices informations from this website and after take it into a database: https://www.mnb.hu/arfolyamok
I wrote this code, but something wrong with it. How can i fix it, where i have to change it?
I am working with Python 2.7.13 on Windows 7.
The code is here:
import csv
import requests
from BeautifulSoup import BeautifulSoup
url = 'https://www.mnb.hu/arfolyamok'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
table = soup.find('tbody', attrs={'class': 'stripe'})
list_of_rows = []
for row in table.findAll('tr')[1:]:
list_of_cells = []
for cell in row.findAll('td'):
text = cell.text.replace(' ', '')
list_of_cells.append(text)
list_of_rows.append(list_of_cells)
print list_of_rows
outfile = open("./inmates.csv", "wb")
writer = csv.writer(outfile)
writer.writerow(["Pénznem", "Devizanév", "Egység", "Forintban kifejezett érték"])
writer.writerows(list_of_rows)
Add # coding=utf-8 to the top of your code. This will help solve the SyntaxError you are receiving. Also make sure your indentation is correct!
So i have a working code that pulls data from 30 websites on a domain.
with open("c:\source\list.csv") as f:
for row in csv.reader(f):
for url in row:
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
tables = soup.find('table', attrs={"class": "hpui-standardHrGrid-table"})
for rows in tables.find_all('tr', {'releasetype': 'Current_Releases'})[0::1]:
item = []
for val in rows.find_all('td'):
item.append(val.text.strip())
with open('c:\source\output_file.csv', 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow({url})
writer.writerows(item)
When I open the CSV file, I see each character taken from the 'Item' variable is stored in its own cell. I can't seem to find out what the heck is doing this and how to fix it.
Any thoughts?
I fixed this by changing
writer.writerows(item)
to
writer.writerow(item)