Pagination in Python (BeautifulSoup)

I'm new to Python, and while I have managed to scrape the first page, I'm stuck as to how to move through the pagination. Here's my code so far for the first page:
import requests
import csv
from bs4 import BeautifulSoup
page = requests.get("http://books.toscrape.com/")
soup = BeautifulSoup(page.content, 'html.parser')
books = soup.find("section")
book_list = books.find_all(class_="product_pod")
csvfile = csv.writer(open('books.csv', 'w', newline=''))
csvfile.writerow(['Title', 'Price', 'Stock', 'Link'])
for book in book_list:
    price = book.find(class_="price_color").get_text()
    title = book.select_one('a img')['alt']
    stock = book.find('p', attrs={"class": "instock availability"}).get_text().strip()
    link = "http://books.toscrape.com/" + book.find('a')['href']
    csvfile.writerow([title, price, stock, link])

One solution is to "hardcode" the number of pages (in this case, 50); a minimal sketch of that variant is shown below. Alternatively, you can get the next page "dynamically": search for the "Next" button on the current page; if it exists, follow its link and keep scraping; if it doesn't exist, stop.
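A sketch of the hardcoded variant, assuming the site keeps its current catalogue/page-N.html URL pattern and exactly 50 pages:
import requests
from bs4 import BeautifulSoup

for page_number in range(1, 51):
    # pages 1..50 all follow the same URL pattern
    url = f"http://books.toscrape.com/catalogue/page-{page_number}.html"
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for book in soup.find_all(class_="product_pod"):
        print(book.select_one("a img")["alt"])  # book title from the thumbnail's alt text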
The dynamic approach in full:
import csv
import requests
from bs4 import BeautifulSoup

page = requests.get("http://books.toscrape.com/")

with open('books.csv', 'w', newline='') as f_out:  # books.csv
    csvfile = csv.writer(f_out)
    csvfile.writerow(['Title', 'Price', 'Stock', 'Link'])

    current_page = 1
    current_url = "http://books.toscrape.com/"
    while True:
        print('Processing page {}...'.format(current_page))

        soup = BeautifulSoup(page.content, 'html.parser')
        books = soup.find("section")
        book_list = books.find_all(class_="product_pod")

        for book in book_list:
            price = book.find(class_="price_color").get_text()
            title = book.select_one('a img')['alt']
            stock = book.find('p', attrs={"class": "instock availability"}).get_text().strip()
            link = "http://books.toscrape.com/" + book.find('a')['href']
            csvfile.writerow([title, price, stock, link])

        # is there a "Next" button?
        next_link = soup.select_one('li.next > a')
        if not next_link:
            # no, we're on the last page, exit
            break

        # yes, continue:
        current_url = current_url.rsplit('/', maxsplit=1)[0] + '/' + next_link['href']
        page = requests.get(current_url)
        current_page += 1
Produces:
Processing page 1...
Processing page 2...
...
Processing page 50...
and writes one row per book to books.csv.

Related

Scraping only returns header not details

I'm a newbie to Python and am just teaching myself how to code and scrape data. Can someone explain what I'm doing wrong, or why the following script only writes the headers and no data into the text file?
Is it because the scrape is returning None or empty data fields, or am I missing something in my logic?
I'm not getting any errors from the code.
Code:
# import necessary libraries
import requests
from bs4 import BeautifulSoup
import csv
import datetime

# get the current date
date = datetime.datetime.now().strftime("%Y%m%d")

# create the output file
filename = 'C:/Users/AJS2/Documents/datafiles_test/' + date + '.txt'
with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['url', 'name', 'address', 'phone', 'email'])

# define the base url
base_url = "http://www.wedding-planners.com/index.cfm"

# get the page
page = requests.get(base_url)

# create a beautifulsoup object
soup = BeautifulSoup(page.content, 'html.parser')

# get the next button
next_button = soup.find('a', class_='Next')

# set the counter
counter = 0

# loop through the pages
while next_button and counter < 50:
    # get the list of wedding planners
    wedding_planners = soup.find_all('div', class_='plannerName')

    # loop through the list of wedding planners
    for planner in wedding_planners:
        # get the url
        url = planner.a['href']

        # get the page
        page = requests.get(url)

        # create a beautifulsoup object
        soup = BeautifulSoup(page.content, 'html.parser')

        # get the name
        name = soup.find('h1', class_='head1').text

        # get the address
        address = soup.find('span', class_='address').text

        # get the phone
        phone = soup.find('span', class_='phone').text

        # get the email
        email = soup.find('span', class_='email').text

        # save the data
        with open(filename, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([url, name, address, phone, email])

    # increment the counter
    counter += 1

    # get the next page
    page = requests.get(next_button['href'])

    # create a beautifulsoup object
    soup = BeautifulSoup(page.content, 'html.parser')

    # get the next button
    next_button = soup.find('a', class_='Next')

print('Finished scraping')
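One quick way to test the "returning None" hypothesis is to print what the very first request finds before the loop ever runs. A minimal check, reusing the selectors from the script above:
import requests
from bs4 import BeautifulSoup

base_url = "http://www.wedding-planners.com/index.cfm"
soup = BeautifulSoup(requests.get(base_url).content, 'html.parser')

# If this prints None, soup.find('a', class_='Next') does not match the live HTML,
# the while loop above never executes, and only the header row is written.
print(soup.find('a', class_='Next'))

# Likewise, an empty list here means the planner listings are not being found.
print(soup.find_all('div', class_='plannerName'))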

Create a specific Web Scraper

I am making an effort to learn scraping in Python, and in this case my idea is to build a tool that obtains data from a web page. My problem is writing the "for" loop that goes through the page and collects the data of each box (item), namely:
IDoffer
List
Title
Location
content
phone
It is not a homework task, it is my own initiative, but I am not making progress, so I would appreciate your help.
Here is the code I have so far:
from bs4 import BeautifulSoup
import requests

URL_BASE = "https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina="
MAX_PAGES = 2
counter = 0

for i in range(0, MAX_PAGES):
    # Building the URL
    if i > 0:
        url = "%s%d" % (URL_BASE, i)
    else:
        url = URL_BASE

    # We make the request to the web
    req = requests.get(url)

    # We check that the request returns a Status Code = 200
    statusCode = req.status_code
    if statusCode == 200:
        # We pass the HTML content of the web to a BeautifulSoup() object
        html = BeautifulSoup(req.text, "html.parser")

        # We get all the divs where the items are
        entradas_IDoffer = html.find_all('div', {'class': 'aditem-header'})

        # We go through all the items and extract info
        for entrada1 in entradas_IDoffer:
            # THESE ARE SOME ATTEMPTS
            # Title = entrada.find('div', {'class': 'aditem-detail-title'}).getText()
            # location = entrada.find('div', {'class': 'list-location-region'}).getText()
            # content = entrada.find('div', {'class': 'tx'}).getText()
            # phone = entrada.find('div', {'class': 'telefonos'}).getText()

            # Offer Title
            entradas_Title = html.find_all('div', {'class': 'aditem-detail'})
            for entrada2 in entradas_Title:
                counter += 1
                Title = entrada2.find('a', {'class': 'aditem-detail-title'}).getText()

            counter += 1
            IDoffer = entrada1.find('div', {'class': 'x5'}).getText()

            # Location
            # entradas_location = html.find_all('div', {'class': 'aditem-detail'})
            # for entrada4 in entradas_location:
            #     counter += 1
            #     location = entrada4.find('div', {'class': 'list-location-region'}).getText()

            # Offer content
            # entradas_content = html.find_all('div', {'class': 'aditem-detail'})
            # for entrada3 in entradas_content:
            #     counter += 1
            #     content = entrada3.find('div', {'class': 'tx'}).getText()

            print("%d - %s \n%s\n%s" % (counter, IDoffer.strip(), url, Title))
    else:
        # If the page no longer exists and it gives me a 400
        try:
            r = requests.head(req)
            print(r.status_code)
        except requests.ConnectionError:
            print("failed to connect")
        break
Correct entradas_IDoffer:
entradas_IDoffer = html.find_all("div", class_="aditem CardTestABClass")
Title is located under an "a" tag, not a "div":
title = entrada.find("a", class_="aditem-detail-title").text.strip()
location = entrada.find("div", class_="list-location-region").text.strip()
content = entrada.find("div", class_="tx").text.strip()
Do the same for the other data.
They might be loading the phone number with JavaScript, so you may not be able to get it with bs4; you can get it using Selenium.
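A rough Selenium sketch of that idea is below; every selector in it is a placeholder (the real class names must be checked in the browser's developer tools), so treat it as an outline under those assumptions:
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina=1')

# Placeholder selector: click whatever element reveals the phone number,
# then parse the rendered page (driver.page_source) with BeautifulSoup.
for button in driver.find_elements(By.CSS_SELECTOR, '.telefonos'):
    button.click()

html = driver.page_source
driver.quit()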
You wrote very lengthy code to loop through multiple pages; just use range to go through pages 1 and 2, and put the URL in a formatted string:
for page in range(1, 3):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
Full code:
import requests
from bs4 import BeautifulSoup

for page in range(1, 5):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    entradas_IDoffer = soup.find_all("div", class_="aditem CardTestABClass")

    for entrada in entradas_IDoffer:
        title = entrada.find("a", class_="aditem-detail-title").text.strip()
        ID = entrada.find("div", class_="x5").text.strip()
        location = entrada.find("div", class_="list-location-region").text.strip()
        content = entrada.find("div", class_="tx").text.strip()
        print(title, ID, location, content)

I am trying to parse data from all pages. Only the first page is parsed

I am trying to parse data from all pages, but parsing ends after the first page. What could be the problem?
I use pagination with a regular expression.
The first page of the site differs from the other pages in its HTML code, so I had to create two different functions, main_1 and main_2, for the first page and the rest.
If I try to run only the main_2 function, nothing works and the .csv file is not created.
Please help.
import requests
from bs4 import BeautifulSoup
import csv
import re

def get_html(url):
    r = requests.get(url)
    if r.ok:
        return r.text
    print(r.status_code)

def writer_csv(data):
    with open('tesr.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow((data['name'], data['url'], data['price']))

def get_data_page(html):
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find_all('tr', class_='cmc-table-row')
    for tr in trs:
        tds = tr.find_all('td')
        try:
            name = tds[1].find('a', class_='cmc-link').text.strip()
        except:
            name = ''
        try:
            url = 'https://coinmarketcap.com' + str(tds[1].find('a', class_='cmc-link').get('href'))
        except:
            url = ''
        try:
            price = tr.find('td', class_='cmc-table__cell--sort-by__price').find('a').text.strip().replace('$', '')
        except:
            price = ''
        data = {'name': name,
                'url': url,
                'price': price}
        writer_csv(data)

def main_1():
    url_1 = 'https://coinmarketcap.com/'
    get_data_page(get_html(url_1))

def main_2():
    url_2 = 'https://coinmarketcap.com/2/'
    while True:
        get_data_page(get_html(url_2))
        soup = BeautifulSoup(get_html(url_2), 'lxml')
        try:
            pattern = 'Next '
            url_2 = 'https://coinmarketcap.com' + str(soup.find('ul', class_='pagination').find('a', text=re.compile(pattern)).get('href'))
        except:
            break

main_1()
main_2()
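Without the live markup it is hard to say why the "Next" lookup fails, but one way to sidestep the main_1/main_2 split is to rely on the numbered URLs (https://coinmarketcap.com/2/, /3/, ...) with an explicit page limit. A rough sketch reusing get_html() and get_data_page() from above, where the URL pattern and the page limit are assumptions:
def main(max_pages=5):
    for page in range(1, max_pages + 1):
        # page 1 lives at the bare domain, later pages at /2/, /3/, ...
        url = 'https://coinmarketcap.com/' if page == 1 else 'https://coinmarketcap.com/{}/'.format(page)
        html = get_html(url)
        if not html:  # get_html() returns None on a non-200 response
            break
        get_data_page(html)

main()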

Is it possible to scrape an attribute inside a span?

So I want to scrape some phone numbers from a site. The only problem is that they are hidden behind a click. I can't go and click all of them to make them scrapeable, so I wanted to ask if there is any way to get them from the 'data-phone' attribute inside the span tag.
I tried to use data_='data-phone', but that doesn't work.
from bs4 import BeautifulSoup
import requests
import csv
source = requests.get('https://software-overzicht.nl/amersfoort?page=1').text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('cms_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['title, location'])
for number in soup.find_all('span', data_='data-phone'):
    print(number)

for info in soup.find_all('div', class_='company-info-top'):
    title = info.a.text
    location = info.p.text
    csv_writer.writerow([title, location])

csv_file.close()
Change
for number in soup.find_all('span', data_='data-phone'):
    print(number)
to
for number in soup.find_all('span', class_='phone'):
    print(number['data-phone'])
Output:
0334226800
0878739737
0334558584
0334798200
0334720311
0334677050
0334554948
0334535384
0337767840
0334560292
0626214363
0334559065
0334506506
0620423525
0334556166
0332012581
0334557485
0334946111
0334536200
0334545111
0334545430
0337851805
033-4721544
06-26662490
To incorporate that into your csv:
from bs4 import BeautifulSoup
import requests
import csv

with open('C:/cms_scrape.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['naambedrijf', 'adress', 'phone'])

    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')

        for search in soup.find_all('div', class_='company-info-top'):
            title = search.a.text.strip()
            adress = search.p.text.strip()
            try:
                phone = search.find('span', {'class': 'phone'})['data-phone']
            except:
                phone = 'N/A'
            print(title)
            csv_writer.writerow([title, adress, phone])
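As a side note, the same lookup can be written with a CSS attribute selector so that only spans which actually carry data-phone are matched, which makes the try/except optional; a small equivalent snippet:
for span in soup.select('span.phone[data-phone]'):
    print(span['data-phone'])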

Writing scraped data to a CSV file in Python

I wrote simple code which scrapes data from a website, but I'm struggling to save all rows to a CSV file. The finished script saves only one row - the last occurrence in the loop.
def get_single_item_data(item_url):
    f = csv.writer(open("scrpe.csv", "wb"))
    f.writerow(["Title", "Company", "Price_netto"])
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        prodDesc_class = item_name.find('div', class_='productDesc')
        company = prodDesc_class.find('p').text
        company = company.strip()
        price_netto = item_name.find('div', class_="netto").text
        price_netto = price_netto.strip()
        # print title, company, price_netto
        f.writerow([title.encode("utf-8"), company, price_netto])
It is important to save the data in separate columns.
@PadraicCunningham This is my whole script:
import requests
from bs4 import BeautifulSoup
import csv

url_klocki = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
r = requests.get(url_klocki)
soup = BeautifulSoup(r.content, "html.parser")

def main_spider(max_page):
    page = 1
    while page <= max_page:
        url = "http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html"
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.content, "html.parser")
        for link in soup.find_all('article', class_='small-product'):
            url = "http://www.selgros24.pl"
            a = link.findAll('a')[0].get('href')
            href = url + a
            # print href
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    f = csv.writer(open("scrpe.csv", "wb"))
    f.writerow(["Title", "Company", "Price_netto"])
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        prodDesc_class = item_name.find('div', class_='productDesc')
        company = prodDesc_class.find('p').text
        company = company.strip()
        price_netto = item_name.find('div', class_="netto").text
        price_netto = price_netto.strip()
        print title, company, price_netto
        f.writerow([title.encode("utf-8"), company, price_netto])

main_spider(1)
The problem is that you are opening the output file in get_single_item_data, and it is getting closed when that function returns and f goes out of scope.
You want to pass an open file in to get_single_item_data so multiple rows will be written.
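A minimal sketch of that change, opening the file once in main_spider() and passing the csv writer down (shown in Python 3 style, whereas the original snippet uses Python 2 print statements and "wb" mode):
import csv
import requests
from bs4 import BeautifulSoup

def get_single_item_data(item_url, writer):
    soup = BeautifulSoup(requests.get(item_url).content, "html.parser")
    for item_name in soup.find_all('div', attrs={"id": 'main-container'}):
        title = item_name.find('h1').text
        company = item_name.find('div', class_='productDesc').find('p').text.strip()
        price_netto = item_name.find('div', class_="netto").text.strip()
        writer.writerow([title, company, price_netto])

def main_spider(max_page):
    with open("scrpe.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Company", "Price_netto"])
        for _ in range(max_page):
            # the listing URL is kept fixed here, as in the original script
            listing = BeautifulSoup(
                requests.get("http://selgros24.pl/Dla-dzieci/Zabawki/Klocki-pc1121.html").content,
                "html.parser")
            for link in listing.find_all('article', class_='small-product'):
                href = "http://www.selgros24.pl" + link.find('a').get('href')
                get_single_item_data(href, writer)

main_spider(1)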
