Create a specific web scraper - Python

I am learning to scrape with Python, and in this case my idea is to build a tool that obtains data from a web page. My problem is writing the "for" loop that walks the page and collects the data of each box (item), namely:
IDoffer
List
Title
Location
content
phone
This is not an assignment; it is my own initiative, but I am not making progress, so I appreciate your help.
Here is the code I have so far:
from bs4 import BeautifulSoup
import requests

URL_BASE = "https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina="
MAX_PAGES = 2
counter = 0

for i in range(0, MAX_PAGES):
    # Building the URL
    if i > 0:
        url = "%s%d" % (URL_BASE, i)
    else:
        url = URL_BASE
    # We make the request to the web
    req = requests.get(url)
    # We check that the request returns a Status Code = 200
    statusCode = req.status_code
    if statusCode == 200:
        # We pass the HTML content of the web to a BeautifulSoup() object
        html = BeautifulSoup(req.text, "html.parser")
        # We get all the divs where the entries are
        entradas_IDoffer = html.find_all('div', {'class': 'aditem-header'})
        # We go through all the entries and extract info
        for entrada1 in entradas_IDoffer:
            # THESE ARE SOME ATTEMPTS
            # Title = entrada.find('div', {'class': 'aditem-detail-title'}).getText()
            # location = entrada.find('div', {'class': 'list-location-region'}).getText()
            # content = entrada.find('div', {'class': 'tx'}).getText()
            # phone = entrada.find('div', {'class': 'telefonos'}).getText()
            # Offer Title
            entradas_Title = html.find_all('div', {'class': 'aditem-detail'})
            for entrada2 in entradas_Title:
                counter += 1
                Title = entrada2.find('a', {'class': 'aditem-detail-title'}).getText()
            counter += 1
            IDoffer = entrada1.find('div', {'class': 'x5'}).getText()
            # Location
            # entradas_location = html.find_all('div', {'class': 'aditem-detail'})
            # for entrada4 in entradas_location:
            #     counter += 1
            #     location = entrada4.find('div', {'class': 'list-location-region'}).getText()
            # Offer content
            # entradas_content = html.find_all('div', {'class': 'aditem-detail'})
            # for entrada3 in entradas_content:
            #     counter += 1
            #     content = entrada3.find('div', {'class': 'tx'}).getText()
            print("%d - %s \n%s\n%s" % (counter, IDoffer.strip(), url, Title))
    else:
        # If the page no longer exists and it gives me a 400
        try:
            r = requests.head(req)
            print(r.status_code)
        except requests.ConnectionError:
            print("failed to connect")
        break

Correct entradas_IDoffer:
entradas_IDoffer = html.find_all("div", class_="aditem CardTestABClass")
Title is located under an "a" tag, not a "div":
title = entrada.find("a", class_="aditem-detail-title").text.strip()
location = entrada.find("div", class_="list-location-region").text.strip()
content = entrada.find("div", class_="tx").text.strip()
Do the same for the other data.
The phone number is probably loaded with JavaScript, so you may not be able to get it with bs4; you can get it with Selenium (see the sketch after the full code below).
You wrote very lengthy code to loop through multiple pages; just use range() like this to go through pages 1 and 2, and put the URL in a formatted string:
for page in range(1, 3):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
Full code:
import requests
from bs4 import BeautifulSoup

for page in range(1, 5):
    url = f'https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    entradas_IDoffer = soup.find_all("div", class_="aditem CardTestABClass")
    for entrada in entradas_IDoffer:
        title = entrada.find("a", class_="aditem-detail-title").text.strip()
        ID = entrada.find("div", class_="x5").text.strip()
        location = entrada.find("div", class_="list-location-region").text.strip()
        content = entrada.find("div", class_="tx").text.strip()
        print(title, ID, location, content)
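Since the phone number is likely injected with JavaScript, a minimal Selenium sketch for that field; the 'telefonos' class is taken from the question and is an assumption about what the rendered page uses:
from selenium import webdriver
from bs4 import BeautifulSoup

# Hedged sketch: Selenium renders the JavaScript before we parse the page source.
driver = webdriver.Chrome()  # needs Chrome plus a compatible driver available
driver.get('https://www.milanuncios.com/ofertas-de-empleo-en-madrid/?dias=3&demanda=n&pagina=1')
soup = BeautifulSoup(driver.page_source, 'html.parser')
for ad in soup.find_all('div', class_='aditem CardTestABClass'):
    phone = ad.find('div', {'class': 'telefonos'})  # class name assumed from the question
    print(phone.text.strip() if phone else 'no phone shown')
driver.quit()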

Related

Can anyone please guide me on how I can do web scraping of multiple pages of booking.com?

This is the URL:
url = 'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_c
Hotel_name = doc.find_all("div", {'class': "fcab3ed991 a23c043802"})
This gives me all the hotel names on page 1, but how can I get the hotel names of all the pages?
I've tried this:
import requests
from bs4 import BeautifulSoup

# Initialize the page number
page_number = 0

while True:
    # Increment the page number
    page_number += 1
    # Make the GET request to the URL
    url = f"https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={page_number*15}"
    response = requests.get(url)
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the hotel information
    hotels = soup.find_all('div', {'class': "fcab3ed991 a23c043802"})
    if not hotels:
        break
    for hotel in hotels:
        price = hotel.find('div', {' data-testid="title'}).text
        print(f"{price}")
But it gives me an empty list as output.
Avoid selecting elements by classes that look highly dynamic and use the HTML structure instead. Check the total number of results and use it in range() to iterate over the results.
Example
import requests, re
from bs4 import BeautifulSoup

data = []

soup = BeautifulSoup(
    requests.get('https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15',
                 headers={'user-agent': 'some agent'}
                 ).text)

num_results = int(re.search(r'\d+', soup.select_one('div:has(+[data-testid="pagination"])').text).group(0))

for i in range(0, int(num_results/25)):
    soup = BeautifulSoup(
        requests.get(f'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={int(i*25)}',
                     headers={'user-agent': 'some agent'}
                     ).text
    )
    data.extend([e.select_one('[data-testid="title"]').text for e in soup.select('[data-testid="property-card"]')])
data
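One caveat worth noting (an observation added here, not part of the original answer): int(num_results/25) truncates, so a trailing partial page of results is skipped. A minimal sketch of the loop bound rounded up instead:
import math

num_results = 103                      # example value; the answer derives it from the pagination div
pages = math.ceil(num_results / 25)    # 5 pages, where int(103 / 25) would give only 4
for i in range(pages):
    offset = i * 25                    # plug into the &offset= query parameter as in the answer
    print(offset)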

Scraping multiple pages with bs4 Beautiful Soup - only scrapes the first page

*** My code is for practice only!
I'm trying to scrape the names and teams of each player in FPL from their website https://www.premierleague.com/ and I have some problems with the code.
The problem is that it only fetches the page with '-1' at the end of the URL, which I haven't even included in my pages list!
There isn't any logic to the pages: the basic URL is https://www.premierleague.com/players?se=363&cl= and the number after the '=' seems to be random, so I created a list of the numbers and added them to the URL with a for loop.
My code:
import requests
from bs4 import BeautifulSoup
import pandas

plplayers = []
pl_url = 'https://www.premierleague.com/players?se=363&cl='
pages_list = ['1', '2', '131', '34']

for page in pages_list:
    r = requests.get(pl_url + page)
    c = r.content
    soup = BeautifulSoup(c, 'html.parser')
    player_names = soup.find_all('a', {'class': 'playerName'})
    for x in player_names:
        player_d = {}
        player_teams = []
        player_href = x.get('href')
        player_info_url = 'https://www.premierleague.com/' + player_href
        player_r = requests.get(player_info_url, headers=headers)
        player_c = player_r.content
        player_soup = BeautifulSoup(player_c, 'html.parser')
        team_tag = player_soup.find_all('td', {'class': 'team'})
        for team in team_tag:
            try:
                team_name = team.find('span', {'class': 'long'}).text
                if '(Loan)' in team_name:
                    team_name.replace(' (Loan) ', '')
                if team_name not in player_teams:
                    player_teams.append(team_name)
                player_d['NAME'] = x.text
                player_d['TEAMS'] = player_teams
            except:
                pass
        plplayers.append(player_d)

df = pandas.DataFrame(plplayers)
df.to_csv('plplayers.txt')
I would comment this, but I'm new and don't have enough reputation, so I'll have to keep it in an answer.
It looks like when you made the request stored in player_r you specified a headers parameter but didn't actually define a headers variable.
If you replace player_r = requests.get(player_info_url, headers=headers) with player_r = requests.get(player_info_url) instead, your code should run perfectly. At least, it did on my machine.
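Alternatively, if you do want to send headers (some sites block the default requests User-Agent), a small sketch of defining the missing headers variable before the loop; the User-Agent string below is only an example, not something from the question:
# Hypothetical headers dict so the original call with headers=headers works as written.
headers = {'User-Agent': 'Mozilla/5.0 (compatible; my-scraper/0.1)'}
player_r = requests.get(player_info_url, headers=headers)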

How to add scraped data to a CSV file?

I am new to web scraping. I am scraping data from a website: I first scrape the hrefs from the first page, then go to each href and find the 'p' tags inside the class 'address-data'. I want to store the 'p' tag data of one URL in one row and the 'p' tag data of the second URL in the second row. My data is appended to 'myUrl'. I want to save the data to a CSV file, e.g. address, longitude, latitude, phone, email, and then a new line starts.
Here is my code:
from bs4 import BeautifulSoup
import requests
import csv

myUrl = []
urls = ["http://www.shaditayari.pk/s&category=326&location=266&a=true&paged{}".format(i) for i in range(1, 10)]  # make a url list and iterate over it

for url in urls:
    r = requests.get(url)
    print('idr1')
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', {'main-link'}):
        iurl = link.get('href')
        r = requests.get(iurl)
        print(iurl)
        soup = BeautifulSoup(r.content, "lxml")
        with open('lhr.cv', 'wb') as file:
            divs = soup.find_all('div', attrs={"class": "address-data"})
            for div in divs:
                myUrl.append(div.find('p').text)
                # print(myUrl)

with open('lhr.cv', 'w') as file:
    writer = csv.writer(file)
    for row in myUrl:
        writer.writerow(row)
Expected output:
9 Fane RoadŲŒ Lahore 54000, Pakistan|1.561381309140028|74.31484723624567|042-37363901-9|gm#bestwesternlahore.com/sales#bestwesternlahore.com/ reservations#bestwesternlahore.com
1/E-3, Main Boulevard Gulberg III, Lahore|31.525700029363|74.34930089283|0305-2960614|https://www.facebook.com/pages/Zauk-Banquet-Hall/204612846290857
I've written this in Python 2 and using XPaths (because I think they're cleaner and simpler to use for web scraping), but this code will get you your list of links:
# Load required libraries
import requests
from lxml import html
import pandas as pd

# Create base URL
url = "http://www.shaditayari.pk/?s&post_type=ait-item&a=true&paged="

# First, we want to work out the number of pages to scrape. We load any page and get the largest page number
page = requests.get(url + str(1))
tree = html.fromstring(page.content)
no_pages = tree.xpath("//nav/a[last()]/text()")[0]  # This comes out as a list of two - we only want the first one

# Next, we want to scrape the links to each page with the address
links = []
names = []
for i in range(1, int(no_pages) + 1):
    page = requests.get(url + str(i))
    tree = html.fromstring(page.content)
    page_links = tree.xpath("//div[@class = 'item-title']/a/@href")
    page_names = tree.xpath("//a/h3/text()")
    links = links + page_links
    names = names + page_names
    print i

address_links = {"Name": names,
                 "URL": links}
pd.DataFrame(address_links).to_csv(u"address_links.csv")
This code needs completing, with the append, the dictionary completion, and a line to create a CSV, but it will get your details:
address_list = []
latitude_list = []
longitude_list = []
telephone_list = []
email_list = []
webpage_list = []
counter = 0

for url in address_links["URL"]:
    page = requests.get(url)
    tree = html.fromstring(page.content)

    address = tree.xpath("//div[@itemprop = 'streetAddress']/p/text()")
    if len(address) == 0:
        address = ""
    else:
        address = address[0]

    latitude = tree.xpath("//p/meta[@itemprop = 'latitude']/@content")
    if len(latitude) == 0:
        latitude = ""
    else:
        latitude = latitude[0]

    longitude = tree.xpath("//p/meta[@itemprop = 'longitude']/@content")
    if len(longitude) == 0:
        longitude = ""
    else:
        longitude = longitude[0]

    telephone = tree.xpath("//a[@class = 'phone']/text()")
    if len(telephone) == 0:
        telephone = ""
    else:
        telephone = telephone[0]

    email = tree.xpath("//a[@itemprop = 'email']/text()")
    if len(email) == 0:
        email = ""
    else:
        email = email[0]

    webpage = tree.xpath("//a[@itemprop = 'url']/@href")
    if len(webpage) == 0:
        webpage = ""
    else:
        webpage = webpage[0]

    address_list.append(address)
    # continue for others
    counter += 1
    print counter

address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   # continue for others
                   }
You might need to add in some unicode encoding before you turn it into a CSV. That's answered here.
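For completeness, a minimal sketch of the parts left as an exercise above (the remaining appends inside the loop, the finished dictionary, and the CSV line); the column names are assumptions based on the fields collected above:
# Inside the for loop, right after address_list.append(address):
latitude_list.append(latitude)
longitude_list.append(longitude)
telephone_list.append(telephone)
email_list.append(email)
webpage_list.append(webpage)

# After the loop: one column per field, one row per business, then write the CSV.
address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   "Latitude": latitude_list,
                   "Longitude": longitude_list,
                   "Telephone": telephone_list,
                   "Email": email_list,
                   "Webpage": webpage_list}
pd.DataFrame(address_details).to_csv("address_details.csv", index=False, encoding="utf-8")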

How can I make the output pairs of list : content in my Python code?

I have been developing a Python web crawler for this website. I made two functions, which work well separately.
One collects the list of stock items, and
the other collects the content data of each listing.
I would like the output of my code to be pairs of
"list#1/content#1",
"list#2/content#2",
"list#3/content#3",
What needs to be modified in my code in order to achieve this?
Thanks.
from bs4 import BeautifulSoup
import urllib.request

CAR_PAGE_TEMPLATE = "http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page="
BASE_PAGE = 'http://www.bobaedream.co.kr'

def fetch_post_list():
    for i in range(20, 21):
        URL = CAR_PAGE_TEMPLATE + str(i)
        res = urllib.request.urlopen(URL)
        html = res.read()
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='cyber')
        # print("Page#", i)
        # 50 lists per each page
        lists = table.find_all('tr', itemtype="http://schema.org/Article")
        count = 0
        for lst in lists:
            if lst.find_all('td')[3].find('em').text:
                lst_price = lst.find_all('td')[3].find('em').text
                lst_title = lst.find_all('td')[1].find('a').text
                lst_link = lst.find_all('td')[1].find('a')['href']
                lst_photo_url = ''
                if lst.find_all('td')[0].find('img'):
                    lst_photo_url = lst.find_all('td')[0].find('img')['src']
                count += 1
            else:
                continue
            # print('#', count, lst_title, lst_photo_url, lst_link, lst_price)
    return lst_link

def fetch_post_content(lst_link):
    URL = BASE_PAGE + lst_link
    res = urllib.request.urlopen(URL)
    html = res.read()
    soup = BeautifulSoup(html, 'html.parser')
    # Basic Information
    table = soup.find('div', class_='rightarea')
    # Number, Year, Mileage, Gas Type, Color, Accident
    content_table1 = table.find_all('div')[0]
    dds = content_table1.find_all('dd')
    for dd in dds:
        car_span_t = dd.find_all('span', {'class': 't'})[0]
        car_span_s = dd.find_all('span', {'class': 's'})[0]
        # print(car_span_t.text, ':', car_span_s.text)
    # Seller Information
    content_table2 = table.find_all('div')[1]
    dds2 = content_table2.find_all('dd')
    for dd2 in dds2:
        seller_span_t = dd.find_all('span', {'class': 't'})[0]
        seller_span_s = dd.find_all('span', {'class': 's'})[0]
        # print(seller_span_t.text, ':', seller_span_s.text)
    return dds
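A minimal sketch (an assumption about the intent, not part of the original post) of one way to produce the list/content pairs: collect every link instead of returning only the last one, then pair each link with the content fetched for it using the existing fetch_post_content function.
def fetch_post_links():
    # Hypothetical helper: same page range and selectors as fetch_post_list, but it gathers all links.
    links = []
    for i in range(20, 21):
        URL = CAR_PAGE_TEMPLATE + str(i)
        soup = BeautifulSoup(urllib.request.urlopen(URL).read(), 'html.parser')
        table = soup.find('table', class_='cyber')
        for lst in table.find_all('tr', itemtype="http://schema.org/Article"):
            anchor = lst.find_all('td')[1].find('a')
            if anchor and anchor.get('href'):
                links.append(anchor['href'])
    return links

pairs = [(link, fetch_post_content(link)) for link in fetch_post_links()]
for idx, (link, content) in enumerate(pairs, 1):
    print('list#%d / content#%d' % (idx, idx), link)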

How to get text from next pages using Beautifulsoup in python 3?

I'm trying to get all the game outcomes for every page for a team. So far I am able to get all the opponent 1 vs opponent 2 match-ups and score outcomes, but I don't know how to get to the next page to get the rest of the data. Would I find the next page and put it in a while loop? Here is the link to the team I want:
http://www.gosugamers.net/counterstrike/teams/7397-natus-vincere/matches
This is what I have so far; it gets all the team's matches played and the scores, but only for the first page.
def all_match_outcomes():
    for match_outcomes in match_history_url():
        rest_server(True)
        page = requests.get(match_outcomes).content
        soup = BeautifulSoup(page, 'html.parser')
        team_name_element = soup.select_one('div.teamNameHolder')
        team_name = team_name_element.find('h1').text.replace('- Team Overview', '')
        for match_outcome in soup.select('table.simple.gamelist.profilelist tr'):
            opp1 = match_outcome.find('span', {'class': 'opp1'}).text
            opp2 = match_outcome.find('span', {'class': 'opp2'}).text
            opp1_score = match_outcome.find('span', {'class': 'hscore'}).text
            opp2_score = match_outcome.find('span', {'class': 'ascore'}).text
            if match_outcome(True):  # If teams have past matches
                print(team_name, '%s %s:%s %s' % (opp1, opp1_score, opp2_score, opp2))
Get the last page number and iterate page by page until you hit the last page.
Complete working code:
import re
import requests
from bs4 import BeautifulSoup

url = "http://www.gosugamers.net/counterstrike/teams/7397-natus-vincere/matches"
with requests.Session() as session:
    response = session.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # locate the last page link
    last_page_link = soup.find("span", text="Last").parent["href"]
    # extract the last page number
    last_page_number = int(re.search(r"page=(\d+)$", last_page_link).group(1))

    print("Processing page number 1")
    # TODO: extract data

    # iterate over all pages starting from page 2 (since we are already on the page 1)
    for page_number in range(2, last_page_number + 1):
        print("Processing page number %d" % page_number)

        link = "http://www.gosugamers.net/counterstrike/teams/7397-natus-vincere/matches?page=%d" % page_number
        response = session.get(link)
        soup = BeautifulSoup(response.content, "html.parser")
        # TODO: extract data
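A sketch of what the two "# TODO: extract data" steps could look like, reusing the selectors from the question's code (with the assumption that the markup is the same on every page):
def extract_matches(soup, team_name):
    # Pull opponents and scores from each match row, as in the question's code.
    for row in soup.select('table.simple.gamelist.profilelist tr'):
        opp1 = row.find('span', {'class': 'opp1'})
        opp2 = row.find('span', {'class': 'opp2'})
        hscore = row.find('span', {'class': 'hscore'})
        ascore = row.find('span', {'class': 'ascore'})
        if opp1 and opp2 and hscore and ascore:
            print(team_name, '%s %s:%s %s' % (opp1.text, hscore.text, ascore.text, opp2.text))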
