Scraping only returns headers, not details - Python

I'm a newbie to Python and am just teaching myself how to code and scrape data. Hoping someone can explain what I'm doing wrong, or why the following script only writes the headers to the text file and no data?
Is it because the scrape is returning None or empty data fields, or am I missing something in my logic?
I'm not getting any errors from the code, as far as I can see in the IDLE output.
Code:
# import necessary libraries
import requests
from bs4 import BeautifulSoup
import csv
import datetime

# get the current date
date = datetime.datetime.now().strftime("%Y%m%d")

# create the output file
filename = 'C:/Users/AJS2/Documents/datafiles_test/' + date + '.txt'
with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['url', 'name', 'address', 'phone', 'email'])

# define the base url
base_url = "http://www.wedding-planners.com/index.cfm"

# get the page
page = requests.get(base_url)
# create a beautifulsoup object
soup = BeautifulSoup(page.content, 'html.parser')

# get the next button
next_button = soup.find('a', class_='Next')

# set the counter
counter = 0

# loop through the pages
while next_button and counter < 50:
    # get the list of wedding planners
    wedding_planners = soup.find_all('div', class_='plannerName')
    # loop through the list of wedding planners
    for planner in wedding_planners:
        # get the url
        url = planner.a['href']
        # get the page
        page = requests.get(url)
        # create a beautifulsoup object
        soup = BeautifulSoup(page.content, 'html.parser')
        # get the name
        name = soup.find('h1', class_='head1').text
        # get the address
        address = soup.find('span', class_='address').text
        # get the phone
        phone = soup.find('span', class_='phone').text
        # get the email
        email = soup.find('span', class_='email').text
        # save the data
        with open(filename, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([url, name, address, phone, email])
    # increment the counter
    counter += 1
    # get the next page
    page = requests.get(next_button['href'])
    # create a beautifulsoup object
    soup = BeautifulSoup(page.content, 'html.parser')
    # get the next button
    next_button = soup.find('a', class_='Next')

print('Finished scraping')
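Note that if soup.find('a', class_='Next') returns None on the first page, the while condition is false immediately, the loop body never runs, and the file ends up with only the header row, with no error raised. A minimal sanity check (a sketch that assumes your selectors; the class names here come from your code, not from inspecting the site) is to print what the first lookups actually return before entering the loop:

import requests
from bs4 import BeautifulSoup

page = requests.get("http://www.wedding-planners.com/index.cfm")
soup = BeautifulSoup(page.content, 'html.parser')

# None here means the while loop never starts and only the header row is written
print(soup.find('a', class_='Next'))
# 0 here means no planner rows matched the selector
print(len(soup.find_all('div', class_='plannerName')))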

Related

issues saving data into csv during webscraping

I am having some issues saving rows to a CSV file after web scraping. I used the same notation that worked well before on another site, but now the CSV file is blank; it seems Python is not writing any rows.
Here is my code, thanks in advance:
import requests
from bs4 import BeautifulSoup
import csv
import lxml

html_page = requests.get('https://www.scrapethissite.com/pages/forms/?page_num=1').text
soup = BeautifulSoup(html_page, 'lxml')

# get the number of pages (it might change in the future as the data is updated)
pagenum = soup.find('ul', {'class': 'pagination'})
n = pagenum.findAll('li')[-2].find('a')['href'].split('=')[1]

# now we convert the value of the page in a range so that we can loop over it
page = range(1, int(n) + 1)
print(page)

with open('HockeyLeague.csv', 'w') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['team_name', 'year', 'wins', 'losses', 'win_perc', 'goal_for', 'goal_against'])
    for p in page:
        html_page = requests.get(f'https://www.scrapethissite.com/pages/forms/?page_num={p}&per_page=25').text
        soup = BeautifulSoup(html_page, 'lxml')
        table = soup.find('table', {'class': 'table'})
        for row in table.findAll('tr', {'class': 'team'}):
            # getting the wanted variables:
            team_name = row.find('td', {'class': 'name'}).text
            year = row.find('td', {'class': 'year'}).text
            wins = row.find('td', {'class': 'wins'}).text
            losses = row.find('td', {'class': 'losses'}).text
            goal_for = row.find('td', {'gf'}).text
            goal_against = row.find('td', {'ga'}).text
            try:
                win_perc = row.find('td', {'pct text-success'}).text
            except:
                win_perc = row.find('td', {'pct text-danger'}).text
            # write the data in the csv file we created at the beginning
            csv_writer.writerow([team_name, year, wins, losses, win_perc, goal_for, goal_against])
Since your script is in general working, these are just some things you should keep in mind:
I would recommend opening the file with newline='' on all platforms to disable universal newlines translation, and with encoding='utf-8' to be sure you are working with the "correct" one:

with open('HockeyLeague.csv', 'w', newline='', encoding='utf-8') as f:
    ...

.strip() your texts or use .get_text(strip=True) to get clean output and avoid line breaks you do not want:

team_name = row.find('td', {'class': 'name'}).text.strip()
year = row.find('td', {'class': 'year'}).text.strip()
...

In newer code, avoid the old syntax findAll(); use find_all() instead. For more, take a minute to check the docs.
Alternative Example
This one uses a while loop that checks for the "Next" button and extracts its URL, and stripped_strings to extract the texts from each row:
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://www.scrapethissite.com/pages/forms/'

with open('HockeyLeague.csv', 'w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['team_name', 'year', 'wins', 'losses', 'win_perc', 'goal_for', 'goal_against'])
    while True:
        html_page = requests.get(url).text
        soup = BeautifulSoup(html_page, 'html.parser')
        for row in soup.find_all('tr', {'class': 'team'}):
            # write the data in the csv file we created at the beginning
            csv_writer.writerow(list(row.stripped_strings)[:-1])
        if soup.select_one('.pagination a[aria-label="Next"]'):
            url = 'https://www.scrapethissite.com' + soup.select_one('.pagination a[aria-label="Next"]').get('href')
        else:
            break
Output
team_name,year,wins,losses,win_perc,goal_for,goal_against
Boston Bruins,1990,44,24,0.55,299,264
Buffalo Sabres,1990,31,30,0.388,292,278
Calgary Flames,1990,46,26,0.575,344,263
Chicago Blackhawks,1990,49,23,0.613,284,211
Detroit Red Wings,1990,34,38,0.425,273,298
Edmonton Oilers,1990,37,37,0.463,272,272
...

Need help scraping information from multiple webpages and importing it into a CSV file in tabular form - Python

I have been working on scraping the infobox information on Wikipedia. This is the code I have been using:
import requests
import csv
from bs4 import BeautifulSoup

URL = ['https://en.wikipedia.org/wiki/Workers_Credit_Union',
       'https://en.wikipedia.org/wiki/San_Diego_County_Credit_Union',
       'https://en.wikipedia.org/wiki/USA_Federal_Credit_Union',
       'https://en.wikipedia.org/wiki/Commonwealth_Credit_Union',
       'https://en.wikipedia.org/wiki/Center_for_Community_Self-Help',
       'https://en.wikipedia.org/wiki/ESL_Federal_Credit_Union',
       'https://en.wikipedia.org/wiki/State_Employees_Credit_Union',
       'https://en.wikipedia.org/wiki/United_Heritage_Credit_Union']

for url in URL:
    headers = []
    rows = []
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', class_='infobox')
    credit_union_name = soup.find('h1', id="firstHeading")
    header_tags = table.find_all('th')
    headers = [header.text.strip() for header in header_tags]
    data_rows = table.find_all('tr')
    for row in data_rows:
        value = row.find_all('td')
        beautified_value = [dp.text.strip() for dp in value]
        if len(beautified_value) == 0:
            continue
        rows.append(beautified_value)
    rows.append("")
    rows.append([credit_union_name.text.strip()])
    rows.append([url])
    with open(r'credit_unions.csv', 'a+', newline="") as output:
        writer = csv.writer(output)
        writer.writerow(headers)
        writer.writerow(rows)
However, I checked the CSV file and the information is not being presented in tabular form. The scraped elements are being stored in nested lists instead of a single flat list. I need the scraped information for each URL to be stored in a single list and written to the CSV file in tabular form under the headings. Need help regarding this.
The infoboxes have different structures and labels. So I think the best way to solve this is to use dicts and a DictWriter.
import requests
import csv
from bs4 import BeautifulSoup

URL = ['https://en.wikipedia.org/wiki/Workers_Credit_Union',
       'https://en.wikipedia.org/wiki/San_Diego_County_Credit_Union',
       'https://en.wikipedia.org/wiki/USA_Federal_Credit_Union',
       'https://en.wikipedia.org/wiki/Commonwealth_Credit_Union',
       'https://en.wikipedia.org/wiki/Center_for_Community_Self-Help',
       'https://en.wikipedia.org/wiki/ESL_Federal_Credit_Union',
       'https://en.wikipedia.org/wiki/State_Employees_Credit_Union',
       'https://en.wikipedia.org/wiki/United_Heritage_Credit_Union']

csv_headers = set()
csv_rows = []

for url in URL:
    csv_row = {}
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    credit_union_name = soup.find('h1', id="firstHeading")
    table = soup.find('table', class_='infobox')
    data_rows = table.find_all('tr')
    for data_row in data_rows:
        label = data_row.find('th')
        value = data_row.find('td')
        if label is None or value is None:
            continue
        beautified_label = label.text.strip()
        beautified_value = value.text.strip()
        csv_row[beautified_label] = beautified_value
        csv_headers.add(beautified_label)
    csv_row["name"] = credit_union_name.text.strip()
    csv_row["url"] = url
    csv_rows.append(csv_row)

with open(r'credit_unions.csv', 'a+', newline="") as output:
    headers = ["name", "url"]
    headers += sorted(csv_headers)
    writer = csv.DictWriter(output, fieldnames=headers)
    writer.writeheader()
    writer.writerows(csv_rows)
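This works even though not every infobox has every label: csv.DictWriter fills fields that are missing from a row dict with restval (an empty string by default), so rows simply get blank cells for labels they lack. A tiny standalone illustration with made-up values:

import csv
import io

buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=['name', 'url', 'Founded'])
writer.writeheader()
# 'Founded' is missing from this row, so DictWriter writes restval ('') for it
writer.writerow({'name': 'Example CU', 'url': 'https://example.org'})
print(buf.getvalue())
# name,url,Founded
# Example CU,https://example.org,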

Pagination in Python (BeautifulSoup)

I'm new to Python, and whilst I have managed to scrape the first page, I'm stuck on how to move through the pagination. Here's my code so far for the first page:
import requests
import csv
from bs4 import BeautifulSoup

page = requests.get("http://books.toscrape.com/")
soup = BeautifulSoup(page.content, 'html.parser')
books = soup.find("section")
book_list = books.find_all(class_="product_pod")

csvfile = csv.writer(open('books.csv', 'w', newline=''))
csvfile.writerow(['Title', 'Price', 'Stock', 'Link'])

for book in book_list:
    price = book.find(class_="price_color").get_text()
    title = book.select_one('a img')['alt']
    stock = book.find('p', attrs={"class": "instock availability"}).get_text().strip()
    link = "http://books.toscrape.com/" + book.find('a')['href']
    csvfile.writerow([title, price, stock, link])
One solution is to "hardcode" the number of pages (in this case, 50); a sketch of that variant follows the dynamic example below. Or you could get the next page "dynamically": search for the "Next" button on the current page; if it exists, get its link and continue scraping, and if it doesn't, stop.
For example:
import csv
import requests
from bs4 import BeautifulSoup

page = requests.get("http://books.toscrape.com/")

with open('books.csv', 'w', newline='') as f_out:  # books.csv
    csvfile = csv.writer(f_out)
    csvfile.writerow(['Title', 'Price', 'Stock', 'Link'])

    current_page = 1
    current_url = "http://books.toscrape.com/"
    while True:
        print('Processing page {}...'.format(current_page))

        soup = BeautifulSoup(page.content, 'html.parser')
        books = soup.find("section")
        book_list = books.find_all(class_="product_pod")

        for book in book_list:
            price = book.find(class_="price_color").get_text()
            title = book.select_one('a img')['alt']
            stock = book.find('p', attrs={"class": "instock availability"}).get_text().strip()
            link = "http://books.toscrape.com/" + book.find('a')['href']
            csvfile.writerow([title, price, stock, link])

        # is there a "Next" button?
        next_link = soup.select_one('li.next > a')
        if not next_link:
            # no, we're on the last page, exit
            break

        # yes, continue:
        current_url = current_url.rsplit('/', maxsplit=1)[0] + '/' + next_link['href']
        page = requests.get(current_url)
        current_page += 1
Produces a books.csv with one row per book across all 50 pages.
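For completeness, the "hardcode" variant mentioned above could look like the sketch below. It assumes books.toscrape.com's /catalogue/page-N.html URL scheme, which you should verify before relying on it:

import requests
from bs4 import BeautifulSoup

# hardcoded pagination: fetch pages 1..50 directly by URL
for n in range(1, 51):
    page = requests.get('http://books.toscrape.com/catalogue/page-{}.html'.format(n))
    soup = BeautifulSoup(page.content, 'html.parser')
    for book in soup.find_all(class_='product_pod'):
        title = book.select_one('a img')['alt']
        print(title)  # extract the other fields as in the examples above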

Is it possible to scrape an attribute inside a span?

So I want to scrape some phone numbers from a site. The only problem is that they are hidden behind a click. I can't go and click all of them to make them scrapeable, so I wanted to ask if there is any way to get them from the 'data-phone' attribute inside the span tag.
I tried to use data_='data-phone' but that doesn't work.
from bs4 import BeautifulSoup
import requests
import csv

source = requests.get('https://software-overzicht.nl/amersfoort?page=1').text
soup = BeautifulSoup(source, 'lxml')

csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['title, location'])

for number in soup.find_all('span', data_='data-phone'):
    print(number)

for info in soup.find_all('div', class_='company-info-top'):
    title = info.a.text
    location = info.p.text
    csv_writer.writerow([title, location])

csv_file.close()
Change

for number in soup.find_all('span', data_='data-phone'):
    print(number)

to

for number in soup.find_all('span', class_='phone'):
    print(number['data-phone'])
Output:
0334226800
0878739737
0334558584
0334798200
0334720311
0334677050
0334554948
0334535384
0337767840
0334560292
0626214363
0334559065
0334506506
0620423525
0334556166
0332012581
0334557485
0334946111
0334536200
0334545111
0334545430
0337851805
033-4721544
06-26662490
To incorporate that into your csv:
from bs4 import BeautifulSoup
import requests
import csv

with open('C:/cms_scrape.csv', 'w', newline='') as f:
    csv_writter = csv.writer(f)
    csv_writter.writerow(['naambedrijf', 'adress', 'phone'])
    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')
        for search in soup.find_all('div', class_='company-info-top'):
            title = search.a.text.strip()
            adress = search.p.text.strip()
            try:
                phone = search.find('span', {'class': 'phone'})['data-phone']
            except:
                phone = 'N/A'
            print(title)
            csv_writter.writerow([title, adress, phone])
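As a side note on the original data_='data-phone' attempt: HTML data-* attributes can't be passed as keyword arguments (the hyphen makes them invalid Python names), but BeautifulSoup's attrs parameter accepts them, and passing True matches any element that has the attribute regardless of its value. A small sketch, reusing the soup object from above:

# match every <span> that carries a data-phone attribute at all
for span in soup.find_all('span', attrs={'data-phone': True}):
    print(span['data-phone'])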

Writing to scraped links to a CSV file using Python3

I have scraped a website for HTML links and have a result of about 500 links. When I try to write them to a CSV file, I do not get the list, only the base page.
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)

soup = BeautifulSoup(page.text, 'html.parser')

link_set = set()
for link in soup.find_all('a'):
    web_links = link.get("href")
    print(web_links)

csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])
writer.writerow([web_links])
csvfile.close()
I only get two lines in my CSV file: the header 'Links' and www.census.gov. I have tried making it different by adding another for loop in the csv writer area, but I get similar results.
for link in soup.find_all('a'):
    web_links = link.get('href')
    abs_url = join(page, web_links)
    print(abs_url)
    if abs_url and abs_url not in link_set:
        writer.write(str(abs_url) + "\n")
        link_set.add(abs_url)
It seems the 'web_links' definition should be where I put all the links into the CSV file, but no dice. Where am I making my mistake?
In your code, you are writing only two rows to the csv, i.e.:

writer.writerow(['Links'])
writer.writerow([web_links])

Here web_links holds only the last retrieved href value. I also don't see the use of the set instance. You can print and write to the csv without using a set in the following way:
import requests
from bs4 import BeautifulSoup
import csv

page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)

soup = BeautifulSoup(page.text, 'html.parser')

csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])

for link in soup.find_all('a'):
    web_links = link.get("href")
    if web_links:
        print(web_links)
        writer.writerow([web_links])

csvfile.close()
You have never added the scraped links to your set():
import requests
from bs4 import BeautifulSoup
import csv

page = requests.get('https://www.census.gov/programs-surveys/popest.html')
print(page.status_code)

soup = BeautifulSoup(page.text, 'html.parser')

link_set = set()
for link in soup.find_all('a'):
    web_links = link.get("href")
    print(web_links)
    link_set.add(web_links)

csvfile = open('code_python.csv', 'w+', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Links'])

for link in link_set:
    writer.writerow([link])

csvfile.close()
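On the join(page, web_links) line in the second snippet: there is no such join() for URLs, but urllib.parse.urljoin from the standard library does what that line was reaching for, resolving relative hrefs against the page URL. A sketch, assuming the link_set loop above:

from urllib.parse import urljoin

base = 'https://www.census.gov/programs-surveys/popest.html'
for link in soup.find_all('a'):
    href = link.get('href')
    if href:
        # resolve relative hrefs (e.g. '/data/tables.html') to absolute URLs
        link_set.add(urljoin(base, href))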
