How do you iterate over BS4 elements that has the same name? - python

It only scrapes the first table and I'm not sure on how to get it to scrape the second, they both have the same class.
from bs4 import BeautifulSoup
import requests
def getCalendarData(url):
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
for table in soup.find_all('table',class_ = 'ms-schedule-table ms-schedule-table--your' ):
for event in table.find_all('tbody'):
Series = event.find('div',class_ = 'ms-schedule-table-item-main__title').text.strip()
Circuit = event.find('div',class_ = 'ms-schedule-table-item-main__event').text.strip()
Month = event.find('span',class_ = 'ms-schedule-table-date__month').text.strip()
Day = event.find('span',class_ = 'ms-schedule-table-date__day').text.strip()
print(Series,Circuit,Month,Day)
getCalendarData('https://www.motorsport.com/all/schedule/2022/upcoming/')

Your question is misleading, there is no second table on this page, there is only the option to load more data.
Unless you want to switch to selenium, you can also address the resource from which the data is dynamically reloaded.
for p in range(1,3,1):
getCalendarData(f'https://www.motorsport.com/all/schedule/2022/upcoming/?all_event_types=1&p={p}')
Example
A bit more generic with while-loop, to check if there is a load more button:
from bs4 import BeautifulSoup
import requests
url = 'https://www.motorsport.com/all/schedule/2022/upcoming/'
def getCalendarData(table):
for event in table.find_all('tbody'):
Series = event.find('div',class_ = 'ms-schedule-table-item-main__title').text.strip()
Circuit = event.find('div',class_ = 'ms-schedule-table-item-main__event').text.strip()
Month = event.find('span',class_ = 'ms-schedule-table-date__month').text.strip()
Day = event.find('span',class_ = 'ms-schedule-table-date__day').text.strip()
print(Series,Circuit,Month,Day)
while True:
print(f'Scraping url: {url}')
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
getCalendarData(soup.find('table',class_ = 'ms-schedule-table ms-schedule-table--your'))
if soup.select_one('[data-id="nextPage"]'):
url = 'https://www.motorsport.com/'+soup.select_one('[data-id="nextPage"]').get('href')
else:
break

Related

webscraping bus stops with beautifulsoup

I am trying to web scrape bus stop names for a given line, here is an example page for line 212 https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212. I want to have as an output two lists, one with bus stop names in one direction and the other list with another direction. (It's clearly seen on the web page). I managed to get all names in one list with
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
r = requests.get(URL)
soup = BeautifulSoup(r.content,
'html5lib')
print(soup.prettify())
all_bus_stops = []
table = soup.find_all('a')
for element in table:
if element.get_text() in all_bus_stops:
continue
else:
all_bus_stops.append(element.get_text())
return all_bus_stops
print(download_bus_schedule('212'))
I guess the solution would be to somehow divide the soup into two parts.
You can use the bs4.element.Tag.findAll method:
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
all_bus_stops = []
URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
r = requests.get(URL)
soup = BeautifulSoup(r.content, 'html.parser')
for s in soup.select(".holo-list"):
bus_stops = []
for f in s.findAll("li"):
if f.text not in bus_stops:
bus_stops.append(f.text)
all_bus_stops.append(bus_stops)
return all_bus_stops
print(download_bus_schedule('212'))
Output:
[['Pl.Hallera', 'Pl.Hallera', 'Darwina', 'Namysłowska', 'Rondo Żaba', 'Rogowska', 'Kołowa', 'Dks Targówek', 'Metro Targówek Mieszkaniowy', 'Myszkowska', 'Handlowa', 'Metro Trocka', 'Bieżuńska', 'Jórskiego', 'Łokietka', 'Samarytanka', 'Rolanda', 'Żuromińska', 'Targówek-Ratusz', 'Św.Wincentego', 'Malborska', 'Ch Targówek'],
['Ch Targówek', 'Ch Targówek', 'Malborska', 'Św.Wincentego', 'Targówek-Ratusz', 'Żuromińska', 'Gilarska', 'Rolanda', 'Samarytanka', 'Łokietka', 'Jórskiego', 'Bieżuńska', 'Metro Trocka', 'Metro Trocka', 'Metro Trocka', 'Handlowa', 'Myszkowska', 'Metro Targówek Mieszkaniowy', 'Dks Targówek', 'Kołowa', 'Rogowska', 'Rondo Żaba', '11 Listopada', 'Bródnowska', 'Szymanowskiego', 'Pl.Hallera', 'Pl.Hallera']]
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
r = requests.get(URL)
soup = BeautifulSoup(r.content,
'html5lib')
bus_stops_1 = []
bus_stops_2 = []
directions = soup.find_all("ul", {"class":"holo-list"})
for stop in directions[0].find_all("a"):
if stop not in bus_stops_1:
bus_stops_1.append(stop.text.strip())
for stop in directions[1].find_all("a"):
if stop not in bus_stops_2:
bus_stops_2.append(stop.text.strip())
all_bus_stops = (bus_stops_1, bus_stops_2)
return all_bus_stops
print(download_bus_schedule('212')[0])
print(download_bus_schedule('212')[1])
I may have misunderstood as I do not know Polish but see if this helps.
from bs4 import BeautifulSoup
import requests
url = 'https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212'
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")
d = {}
for h2 in soup.select('h2.holo-divider'):
d[h2.text] = []
ul = h2.next_sibling
for li in ul.select('li'):
if li.a.text not in d[h2.text]:
d[h2.text].append(li.a.text)
from pprint import pprint
pprint(d)
As all stops are encapsulated in the next un-ordered list, you could use the find_next function of bs4.
e.g.
URL = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
r = requests.get(URL)
soup = BeautifulSoup(r.content,
'html5lib')
directions = ["Ch Targówek","Pl.Hallera"]
result = {}
for direction in directions:
header = soup.find(text=direction)
list = header.find_next("ul")
stops_names = [stop.get_text() for stop in list]
result[direction] = stops_names
return result
Plus you might want to use f-string to format your strings as it improves reading and is less error prone.

How to get HTML of the whole page without scrolling?

import requests
import urllib.request
from bs4 import BeautifulSoup
def get_photos(nick,how_many):
url = f"https://www.picuki.com/profile/{nick}"
content = requests.get(url,headers={'User-Agent': 'Mozilla/5.0'}).content
soup = BeautifulSoup(content,"html.parser")
images = [f["src"] for f in soup.findAll('img',class_="post-image")]
for index, image in enumerate(images, start=1):
urllib.request.urlretrieve(image, f"/Users/user/PycharmProjects/untitled1/Instagram_images/image{index}.png")
if index == how_many: break
if __name__ == "__main__":
get_photos("Username",20)
So I have this code which downloads images in png format from instagram. But problem is that this page only loads 18 images without scrolling. So if I input 18-36 I need to scroll down page one more time, if 36-54 I need to scroll down 2 times and get it's HTML. How to do it with request and is it even possible with this module?
The images are loaded with Ajax, but you can emulate the Ajax with requests module.
This script will print all image URLs found on user profile:
import requests
from bs4 import BeautifulSoup
username = 'itsdougthepug'
base_url = 'https://www.picuki.com/profile/{username}'
def get_image_urls(username):
url = base_url.format(username=username)
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
while True:
for f in soup.findAll('img',class_="post-image"):
yield f['src']
load_more_url = soup.select_one('.load-more-wrapper[data-next]')
if not load_more_url:
load_more_url = soup.select_one('.pagination-next-page-input[value]')
if load_more_url:
load_more_url = load_more_url['value']
else:
load_more_url = load_more_url['data-next']
if not load_more_url:
break
soup = BeautifulSoup(requests.get('https://www.picuki.com' + load_more_url).content, 'html.parser')
for img in get_image_urls(username):
print(img)
Prints:
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103328423_965950027183296_957866876806120724_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=100&_nc_ohc=sW8Ic2lI-4UAX_b7bkB&oh=dc42f3f625065b6fba524bd39fc29cb0&oe=5EE7819B
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103183716_3364797436946158_1962633742202963007_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=OjegUcacb2kAX_BGNBA&oh=92a8035ffed07e724a77617c6ff73b73&oe=5F0F1F22
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102951446_2650089068539996_1395066409287738000_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=zXDXxxtqYUkAX9_1jE3&oh=06e83257c7a2b1cfea593719a3af60d2&oe=5F0D3F32
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103290695_2721943028038123_664290938707092396_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=107&_nc_ohc=cZKGnM3wjBwAX9wsGvR&oh=132218410341a0ffc2d7d78f38904a01&oe=5F104353
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103207650_283928112789317_1081832932435688252_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=105&_nc_ohc=3XfsL50CwCoAX9k2_dN&oh=969bdf74e73466a39952957bfd8ec528&oe=5F0E2A91
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102546510_111827600395599_8198630171951588410_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=103&_nc_ohc=cVJqLrxo-fUAX9fBZtG&oh=8edcc8a5bf56519d0155e6d23ac514b3&oe=5F0EA104
... and so on.

How do I handle exceptions in BeautifulSoup if the element I'm looking for is not found?

I am making an http request to a website and parse its content to find some attribute values. What I need to know is how do I handle exceptions if code returns [], None or nothing at all.
What I have tried:
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
def get_url():
s = requests.Session()
retries = Retry(total=5,
backoff_factor=10
status_forcelist=[ 500, 502, 503, 504 ])
s.mount('http://', HTTPAdapter(max_retries=retries))
r = s.get('http://httpstat.us/500')
def find_data():
soup = BeautifulSoup(r.text, "lxml")
try:
id = soup.find('a', class_="class").get('id')
except:
print('id not found')
get_url()
Basically if the id is not to find I want to make that GET request again and try to find it.
You can apply the "look before you leap" (LBYL) principle and check the result of find() - it would return None if an element was not found. You can then put the thing into a loop and exit when you have a value, also safeguarding yourself with a loop counter limit:
RETRIES = 10
id = None
session = requests.Session()
for attempt in range(1, RETRIES + 1):
response = session.get(url)
soup = BeautifulSoup(r.text, "lxml")
element = soup.find('a', class_="class", id=True)
if element is None:
print("Attempt {attempt}. Element not found".format(attempt=attempt))
continue
else:
id = element["id"]
break
print(id)
Couple notes:
id=True was set to find only elements with the id element present. You could also do an equivalent with a CSS selector soup.select_one("a.class[id]")
Session() helps to improve performance when issuing requests to the same host multiple times. See more at Session Objects
If all you want to do is make that same request a second time, you could do something like this:
import requests
from bs4 import BeautifulSoup
def find_data(url):
found_data = False
while not found_data:
r = requests.get(url)
soup = BeautifulSoup(r.text, "lxml")
try:
id = soup.find('a', class_="class").get('id')
found_data = True
except:
pass
This puts you at risk of an infinite loop if the data really aren't there. You can do this to avoid that infinite loop:
import requests
from bs4 import BeautifulSoup
def find_data(url, attempts_before_fail=3):
found_data = False
while not found_data:
r = requests.get(url)
soup = BeautifulSoup(r.text, "lxml")
try:
id = soup.find('a', class_="class").get('id')
found_data = True
except:
attempts_before_fail -= 1
if attempts_before_fail == 0:
raise ValueError("couldn't find data after all.")

How do I create a dataframe of jobs and companies that includes hyperlinks?

I am making a function to print a list of links so I can add them to a list of companies and job titles. However, I am having difficulties navigating tag sub-contents. I am looking to list all the 'href' in 'a' in 'div' like so:
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests
page = "https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html"
headers = {'User-Agent':'Mozilla/5.0'}
def get_soup():
session = requests.Session()
pageTree = session.get(page, headers=headers)
return BeautifulSoup(pageTree.content, 'html.parser')
pageSoup = get_soup()
def print_links():
"""this function scrapes the job title links"""
jobLink = [div.a for div in pageSoup.find_all('div', class_='title')]
for div in jobLink:
print(div['href'])
I am trying to make a list but my result is simply text and does not seem to be a link like so:
/pagead/clk?mo=r&ad=-6NYlbfkN0DhVAxkc_TxySVbUOs6bxWYWOfhmDTNcVTjFFBAY1FXZ2RjSBnfHw4gS8ZdlOOq-xx2DHOyKEivyG9C4fWOSDdPgVbQFdESBaF5zEV59bYpeWJ9R8nSuJEszmv8ERYVwxWiRnVrVe6sJXmDYTevCgexdm0WsnEsGomjLSDeJsGsHFLAkovPur-rE7pCorqQMUeSz8p08N_WY8kARDzUa4tPOVSr0rQf5czrxiJ9OU0pwQBfCHLDDGoyUdvhtXy8RlOH7lu3WEU71VtjxbT1vPHPbOZ1DdjkMhhhxq_DptjQdUk_QKcge3Ao7S3VVmPrvkpK0uFlA0tm3f4AuVawEAp4cOUH6jfWSBiGH7G66-bi8UHYIQm1UIiCU48Yd_pe24hfwv5Hc4Gj9QRAAr8ZBytYGa5U8z-2hrv2GaHe8I0wWBaFn_m_J10ikxFbh6splYGOOTfKnoLyt2LcUis-kRGecfvtGd1b8hWz7-xYrYkbvs5fdUJP_hDAFGIdnZHVJUitlhjgKyYDIDMJ-QL4aPUA-QPu-KTB3EKdHqCgQUWvQud4JC2Fd8VXDKig6mQcmHhZEed-6qjx5PYoSifi5wtRDyoSpkkBx39UO3F918tybwIbYQ2TSmgCHzGm32J4Ny7zPt8MPxowRw==&p=0&fvj=1&vjs=3
Additionally, here is my attempt at making a list with the links:
def get_job_titles():
"""this function scrapes the job titles"""
jobs = []
jobTitle = pageSoup.find_all('div', class_='title')
for span in jobTitle:
link = span.find('href')
if link:
jobs.append({'title':link.text,
'href':link.attrs['href']})
else:
jobs.append({'title':span.text, 'href':None})
return jobs
I would regex out from html returned the required info and construct the url from the parameters the page javascript uses to dynamically construct each url. Interestingly, the total number of listings is different when using requests than using browser. You can manually enter the number of listings e.g. 6175 (currently) or use the number returned by the request (which is lower and you miss some results). You could also use selenium to get the correct initial result count). You can then issue requests with offsets to get all listings.
Listings can be randomized in terms of ordering.
It seems you can introduce a limit parameter to increase results_per_page up to 50 e.g.
https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&limit=50&start=0
Furthermore, it seems that it is possible to retrieve more results that are actually given as the total results count on webpage.
py with 10 per page:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs
p = re.compile(r"jobmap\[\d+\]= ({.*?})")
p1 = re.compile(r"var searchUID = '(.*?)';")
counter = 0
final = {}
with requests.Session() as s:
r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
soup = bs(r.content, 'lxml')
tk = p1.findall(r.text)[0]
listings_per_page = 10
number_of_listings = int(soup.select_one('[name=description]')['content'].split(' ')[0].replace(',',''))
#number_of_pages = math.ceil(number_of_listings/listings_per_page)
number_of_pages = math.ceil(6175/listings_per_page) #manually calculated
for page in range(1, number_of_pages + 1):
if page > 1:
r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(10*page-1))
soup = bs(r.content, 'lxml')
tk = p1.findall(r.text)[0]
for item in p.findall(r.text):
data = hjson.loads(item)
jk = data['jk']
row = {'title' : data['title']
,'company' : data['cmp']
,'url' : f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3'
}
final[counter] = row
counter+=1
df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )
If you want to use selenium to get correct initial listings count:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--headless")
d = webdriver.Chrome(r'C:\Users\HarrisQ\Documents\chromedriver.exe', options = options)
d.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
number_of_listings = int(d.find_element_by_css_selector('[name=description]').get_attribute('content').split(' ')[0].replace(',',''))
d.quit()
p = re.compile(r"jobmap\[\d+\]= ({.*?})")
p1 = re.compile(r"var searchUID = '(.*?)';")
counter = 0
final = {}
with requests.Session() as s:
r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
soup = bs(r.content, 'lxml')
tk = p1.findall(r.text)[0]
listings_per_page = 10
number_of_pages = math.ceil(6175/listings_per_page) #manually calculated
for page in range(1, number_of_pages + 1):
if page > 1:
r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(10*page-1))
soup = bs(r.content, 'lxml')
tk = p1.findall(r.text)[0]
for item in p.findall(r.text):
data = hjson.loads(item)
jk = data['jk']
row = {'title' : data['title']
,'company' : data['cmp']
,'url' : f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3'
}
final[counter] = row
counter+=1
df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )

Scrape page with generator

I scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out if what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')
tld = "uz"
has_next = True
page = 0
def create_link(tld, page):
if page == 0:
link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
else:
link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain/page/" + repr(page)
return link
def check_for_next(soup):
disabled_nav = soup.find(class_="pagingDivDisabled")
if disabled_nav:
if "Next" in disabled_nav:
return False
else:
return True
else:
return True
def make_soup(link):
html = jw.get_page(link)
soup = BeautifulSoup(html, "lxml")
return soup
def all_the_pages(counter):
while True:
link = create_link(tld, counter)
soup = make_soup(link)
if check_for_next(soup) == True:
yield counter
else:
break
counter += 1
def scrape_page(soup):
table = soup.find('table', {'class': 'rankTable'})
th = table.find('tbody')
test = th.find_all("td")
correct_cells = range(1,len(test),3)
for cell in correct_cells:
#print test[cell]
url = repr(test[cell])
content = re.sub("<[^>]*>", "", url)
sites.writerow([tld]+[content])
def main():
for page in all_the_pages(0):
print page
link = create_link(tld, page)
print link
soup = make_soup(link)
scrape_page(soup)
main()
My thinking behind the code:
The scraper should get the page, determine if there is another page that follows, scrape the current page and move to the next one, repreating the process. If there is no next page, it should stop. Does that make sense how I'm going it here?
As I told you, you could use selenium for programmatically clicking on the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup
def page_count():
pages = 1
url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
while True:
html = requests.get(url.format(pages)).content
soup = BeautifulSoup(html)
table = soup.find('table', {'class': 'rankTable'})
if len(table.find_all('tr')) <= 1:
return pages
pages += 1

Categories

Resources