Python - web crawling / different result from same code? / requests, bs4 / M1

Python - web crawling / different result from same code? / requests, bs4 / M1 - python

I'm learning Python for web crawling, but I'm totally stuck.
Each time I run this code, the results change.
Very rarely it works, but it almost always returns an empty list.
Why does this happen? Please let me know.
from indeed import extract_indeed_pages, extract_indeed_jobs
# Find out how many result pages there are, then scrape the job listings.
page_total = extract_indeed_pages()
print(page_total)
scraped_jobs = extract_indeed_jobs(page_total)
print(scraped_jobs)
import requests
from bs4 import BeautifulSoup
# Results requested per page; interpolated into URL as the `limit` query parameter.
LIMIT = 50
# Search URL on kr.indeed.com: q=React, radius=100, jt=fulltime. The `l`
# (location) value is percent-encoded UTF-8 and decodes to "서울" (Seoul).
URL = f"https://kr.indeed.com/jobs?q=React&l=%EC%84%9C%EC%9A%B8&radius=100&jt=fulltime&limit={LIMIT}"
def extract_indeed_pages():
    """Return the highest page number shown in the search page's pagination.

    Fetches ``URL`` and reads the numeric links inside the
    ``<div class="pagination">`` element.

    Returns:
        The largest page number found, or ``1`` when the response has no
        pagination block or no numeric page links.
    """
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "pagination"})
    if pagination is None:
        # find() returns None when the element is absent; without this guard
        # the next call raised AttributeError.
        return 1
    pages = []
    for link in pagination.find_all("a"):
        label = link.get_text(strip=True)
        # Non-numeric links are skipped instead of failing inside int().
        if label.isdecimal():
            pages.append(int(label))
    # max() rather than "last link" so the result does not depend on whether
    # a trailing non-numeric link is present.
    return max(pages) if pages else 1
def extract_indeed_jobs(last_page):
    """Fetch the first results page and collect its job-title headings.

    ``last_page`` is accepted but not used by this implementation; only the
    page at offset ``0*LIMIT`` is requested.

    Returns:
        A one-element list whose single item is the collection of
        ``<h2 class="jobTitle">`` elements found on that page.
    """
    response = requests.get(f"{URL}&start={0*LIMIT}")
    parsed = BeautifulSoup(response.text, "html.parser")
    titles = parsed.find_all("h2", {"class": "jobTitle"})
    return [titles]

This happens because the page is built by JavaScript, so the HTML that `requests` receives does not always contain the elements you are looking for. You can inspect the raw page source by pressing Ctrl+U in your browser.

Related

How do you iterate over BS4 elements that has the same name?

It only scrapes the first table and I'm not sure on how to get it to scrape the second, they both have the same class.
from bs4 import BeautifulSoup
import requests
def getCalendarData(url):
    """Download a schedule page and print one line per event.

    Every ``<tbody>`` inside each table with the class
    ``ms-schedule-table ms-schedule-table--your`` is treated as one event;
    its series, circuit, month and day texts are printed space-separated.

    Args:
        url: Address of the schedule page to fetch.
    """
    # (tag name, CSS class) of each printed field, in output order.
    field_specs = (
        ('div', 'ms-schedule-table-item-main__title'),
        ('div', 'ms-schedule-table-item-main__event'),
        ('span', 'ms-schedule-table-date__month'),
        ('span', 'ms-schedule-table-date__day'),
    )
    response = requests.get(url)
    document = BeautifulSoup(response.text, 'html.parser')
    schedules = document.find_all('table', class_='ms-schedule-table ms-schedule-table--your')
    for schedule in schedules:
        for row in schedule.find_all('tbody'):
            values = [row.find(tag, class_=css).text.strip() for tag, css in field_specs]
            print(*values)
# Run the scraper against the 2022 upcoming-events schedule page.
getCalendarData('https://www.motorsport.com/all/schedule/2022/upcoming/')

Your question is misleading, there is no second table on this page, there is only the option to load more data.
Unless you want to switch to selenium, you can also address the resource from which the data is dynamically reloaded.
# Request result pages 1 and 2 through the `p` query parameter.
for page_number in (1, 2):
    getCalendarData(f'https://www.motorsport.com/all/schedule/2022/upcoming/?all_event_types=1&p={page_number}')
Example
A bit more generic with while-loop, to check if there is a load more button:
from bs4 import BeautifulSoup
import requests
# Starting page; the while-loop reassigns this to each "nextPage" link it finds.
url = 'https://www.motorsport.com/all/schedule/2022/upcoming/'
def getCalendarData(table):
    """Print series, circuit, month and day for every event in a schedule table.

    Args:
        table: Parsed ``<table>`` element whose ``<tbody>`` children each
            describe one event, or ``None`` when the page had no such table
            (the caller passes the result of ``soup.find``, which may be
            ``None``).

    Returns:
        None. Output goes to stdout, one space-separated line per event.
    """
    if table is None:
        # Nothing to print; without this guard a page lacking the table
        # crashed with AttributeError.
        return
    for event in table.find_all('tbody'):
        series = event.find('div', class_='ms-schedule-table-item-main__title').text.strip()
        circuit = event.find('div', class_='ms-schedule-table-item-main__event').text.strip()
        month = event.find('span', class_='ms-schedule-table-date__month').text.strip()
        day = event.find('span', class_='ms-schedule-table-date__day').text.strip()
        print(series, circuit, month, day)
# Scrape the current page, then follow its "nextPage" link; stop when a page
# no longer offers one.
while True:
    print(f'Scraping url: {url}')
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    getCalendarData(soup.find('table',class_ = 'ms-schedule-table ms-schedule-table--your'))
    next_link = soup.select_one('[data-id="nextPage"]')
    if not next_link:
        break
    url = 'https://www.motorsport.com/' + next_link.get('href')

webscraping bus stops with beautifulsoup

I am trying to web scrape bus stop names for a given line, here is an example page for line 212 https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212. I want to have as an output two lists, one with bus stop names in one direction and the other list with another direction. (It's clearly seen on the web page). I managed to get all names in one list with
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
    """Return the distinct link texts found on a bus line's timetable page.

    The parsed page is also pretty-printed to stdout.

    Args:
        bus_number: Line number as a string; appended to the timetable URL.

    Returns:
        The text of every ``<a>`` element in document order, keeping only the
        first occurrence of each text.
    """
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html5lib')
    print(soup.prettify())
    anchor_texts = (anchor.get_text() for anchor in soup.find_all('a'))
    # dict keys keep insertion order, so this drops repeats but preserves order.
    return list(dict.fromkeys(anchor_texts))
# Example run for line 212.
print(download_bus_schedule('212'))
I guess the solution would be to somehow divide the soup into two parts.

You can use the bs4.element.Tag.findAll method:
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
    """Return the stop names of a bus line, grouped by ``.holo-list`` element.

    Args:
        bus_number: Line number as a string; appended to the timetable URL.

    Returns:
        A list with one inner list per element matching ``.holo-list``. Each
        inner list holds the text of that element's ``<li>`` items in page
        order, without repeated texts.
    """
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, 'html.parser')
    all_bus_stops = []
    for stop_list in soup.select(".holo-list"):
        unique_names = []
        for item in stop_list.findAll("li"):
            stop_name = item.text
            if stop_name in unique_names:
                continue
            unique_names.append(stop_name)
        all_bus_stops.append(unique_names)
    return all_bus_stops
# Example run for line 212; prints one list of stops per direction.
print(download_bus_schedule('212'))
Output:
[['Pl.Hallera', 'Pl.Hallera', 'Darwina', 'Namysłowska', 'Rondo Żaba', 'Rogowska', 'Kołowa', 'Dks Targówek', 'Metro Targówek Mieszkaniowy', 'Myszkowska', 'Handlowa', 'Metro Trocka', 'Bieżuńska', 'Jórskiego', 'Łokietka', 'Samarytanka', 'Rolanda', 'Żuromińska', 'Targówek-Ratusz', 'Św.Wincentego', 'Malborska', 'Ch Targówek'],
['Ch Targówek', 'Ch Targówek', 'Malborska', 'Św.Wincentego', 'Targówek-Ratusz', 'Żuromińska', 'Gilarska', 'Rolanda', 'Samarytanka', 'Łokietka', 'Jórskiego', 'Bieżuńska', 'Metro Trocka', 'Metro Trocka', 'Metro Trocka', 'Handlowa', 'Myszkowska', 'Metro Targówek Mieszkaniowy', 'Dks Targówek', 'Kołowa', 'Rogowska', 'Rondo Żaba', '11 Listopada', 'Bródnowska', 'Szymanowskiego', 'Pl.Hallera', 'Pl.Hallera']]

import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
    """Return the stop names of a bus line, one list per direction.

    Args:
        bus_number: Line number as a string; appended to the timetable URL.

    Returns:
        A 2-tuple ``(first, second)``. Each item lists the stripped text of
        the ``<a>`` elements in the first and second ``<ul class="holo-list">``
        respectively, in page order and without repeated names.

    Raises:
        IndexError: If the page has fewer than two ``holo-list`` elements.
    """
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    directions = soup.find_all("ul", {"class": "holo-list"})

    def unique_stop_names(stop_list):
        # Compare the stripped text, not the Tag: a Tag never equals a str,
        # so testing the Tag against a list of names filtered nothing.
        names = []
        for stop in stop_list.find_all("a"):
            stop_name = stop.text.strip()
            if stop_name not in names:
                names.append(stop_name)
        return names

    bus_stops_1 = unique_stop_names(directions[0])
    bus_stops_2 = unique_stop_names(directions[1])
    return (bus_stops_1, bus_stops_2)
# Download once and print both directions; calling the function twice fetched
# and parsed the same page two times.
first_direction, second_direction = download_bus_schedule('212')
print(first_direction)
print(second_direction)

I may have misunderstood as I do not know Polish but see if this helps.
from bs4 import BeautifulSoup
import requests
# Build a mapping from each direction heading to the stop names listed after it.
url = 'https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212'
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")
d = {}
for heading in soup.select('h2.holo-divider'):
    stops = d[heading.text] = []
    # The stop list is taken from the node directly following the heading.
    stop_list = heading.next_sibling
    for item in stop_list.select('li'):
        stop_name = item.a.text
        if stop_name in stops:
            continue
        stops.append(stop_name)
from pprint import pprint
pprint(d)

As all stops are encapsulated in the next un-ordered list, you could use the find_next function of bs4.
e.g.
# NOTE(review): this is a fragment of a function body. The enclosing `def` and
# the `bus_number` variable are not shown, and the final `return` only works
# inside that function.
URL = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
r = requests.get(URL)
soup = BeautifulSoup(r.content,
'html5lib')
# Direction names that are looked up in the page; hard-coded for this line.
directions = ["Ch Targówek","Pl.Hallera"]
# Maps each direction name to the list of texts collected for it.
result = {}
# For each direction: find it in the page, take the first <ul> after it, and
# collect get_text() of every child of that <ul>.
# NOTE(review): `list` below shadows the builtin of the same name.
# NOTE(review): presumably `soup.find(text=...)` returns None when the name is
# not on the page, which would make `find_next` fail — confirm.
for direction in directions:
header = soup.find(text=direction)
list = header.find_next("ul")
stops_names = [stop.get_text() for stop in list]
result[direction] = stops_names
return result
Plus you might want to use f-string to format your strings as it improves reading and is less error prone.

Object Subscriptable

While scraping the website, I am getting this error:
links = [tag.a["href"] for tag in soup.find_all('strong')[1:-3]]
TypeError: 'NoneType' object is not subscriptable
Code:
import requests
import concurrent.futures
from bs4 import BeautifulSoup
# Site root; getLyrics prefixes it to each song path.
HOST = "https://www.lyrics.com"
# Album page whose song links are collected.
url = "https://www.lyrics.com/album/3769520/Now+20th+Anniversary%2C+Vol.+2"
# Download and parse the album page
req = requests.get(url)
html = req.content
soup = BeautifulSoup(html , 'html.parser')
# Collect the song links of the album page: the href of the <a> nested in each
# <strong> tag, skipping the first and the last three <strong> tags.
# NOTE: `tag.a` is None for a <strong> without a nested <a>, and subscripting
# it raises the TypeError quoted above.
links = [tag.a["href"] for tag in soup.find_all('strong')[1:-3]]
# Output file names; getLyrics appends one entry per song page it parses.
name = []
def getLyrics(url):
    """Download one song page and return its lyrics text.

    Side effect: appends ``"<song title>.txt"`` to the module-level ``name``
    list, using the text of the ``<h1 id="lyric-title-text">`` element.

    Args:
        url: Path of the song page; ``HOST`` is prefixed before the request.

    Returns:
        The text of the first ``<pre>`` element on the song page.
    """
    song_page = requests.get(HOST + url)
    song_soup = BeautifulSoup(song_page.content, 'html.parser')
    title = song_soup.find('h1', {"id": "lyric-title-text"})
    name.append(title.text + ".txt")
    return song_soup.find('pre').text
# Download every song and keep the lyrics in link order.
# NOTE: result() is called right after each submit(), so the songs are fetched
# one at a time. getLyrics appends to the shared `name` list, and this strict
# ordering is what keeps name[i] matched with lyric[i].
lyric = []
# max_workers must be at least 1, even when no links were found.
with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, len(links))) as executor:
    for link in links:
        lyrics = executor.submit(getLyrics, link).result()
        print(lyrics)
        lyric.append(lyrics)
# Write one text file per song. The previous loop ran over
# range(0, len(name)-1) and therefore never wrote the last song.
for file_name, text in zip(name, lyric):
    # The with-block closes the file even if the write fails; UTF-8 is set
    # explicitly so non-ASCII lyrics do not depend on the platform default.
    with open(file_name, "w", encoding="utf-8") as File:
        File.write(text)
I will be very thankful if you could help me.

you can check if tag.a is not None:
# Keep only the <strong> tags that wrap a link, read each href, then drop the
# first and the last three results.
linked_tags = [tag for tag in soup.find_all('strong') if tag.a is not None]
links = [tag.a['href'] for tag in linked_tags][1:-3]
print(links)
# output ['/lyric/35873929/Tik+Tok+%5BNOW+33%5D', ...]

Scrape all images of off a multiple pages site?

I need to scrape all the images from every page of the URL given in the code, but so far I can only do it manually, one page at a time, up to the last page (the 100th).
This is the code for scraping a single page; I change the page number each time and run the code again.
It is shown below.
Is there any way to make the page number a variable and run a loop until it hits an error — in this case a 404 page, since no more pages would be left?
from bs4 import*
import requests as rq
# Fetch the first results page and collect the image URLs.
r2 = rq.get("https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page=1&phrase=aishwarya%20rai&sort=mostpopular")
soup2 = BeautifulSoup(r2.text, "html.parser")
# Only <img> tags whose src starts with the Getty media prefix are selected.
x = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')
links = [img['src'] for img in x]
# Download each image into aishwarya_rai/, numbering the files from 2.
for index, img_link in enumerate(links):
    img_data = rq.get(img_link).content
    # The with-block closes the file. The former for/else `f.close()` was
    # redundant and raised NameError when no images were found.
    with open("aishwarya_rai/"+str(index+2)+'.jpg', 'wb+') as f:
        f.write(img_data)
The page ranges from 1 to 100.
I need some additional code which makes the "page value" a variable and loops till 100

Use format() function and pass the page variable.
from bs4 import*
import requests as rq
# URL template; the `{}` after `page=` is filled in with str.format().
url="https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page={}&phrase=aishwarya%20rai&sort=mostpopular"
# Image URLs collected from the requested pages.
links = []
# Page numbers 1 through 100 inclusive.
# NOTE(review): the indentation of this snippet was lost; presumably the
# request/parse/collect lines form the loop body — confirm where
# `print(links)` belongs.
for page in range(1,101):
print(url.format(page))
r2 = rq.get(url.format(page))
soup2 = BeautifulSoup(r2.text, "html.parser")
# Only <img> tags whose src starts with the Getty media prefix are selected.
x = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')
for img in x:
links.append(img['src'])
print(links)

Scrape page with generator

I am scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out if what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
# Input CSV (semicolon-delimited) and output CSV (comma-delimited). Both files
# are opened here and stay open for the lifetime of the script.
tlds_file = open("top_level_domains.csv", 'r')
tlds = csv.reader(tlds_file, delimiter=';')
sites_file = open("websites_to_scrape.csv", "w")
sites = csv.writer(sites_file, delimiter=',')
# TLD used when building links and writing rows, plus the initial paging state.
tld = "uz"
has_next = True
page = 0
def create_link(tld, page):
    """Build the domaintyper.com "most popular websites" URL for a TLD page.

    Args:
        tld: Top-level domain without the dot, e.g. ``"uz"``.
        page: Page number; ``0`` selects the base URL without a page suffix.

    Returns:
        The base URL when ``page == 0``, otherwise the base URL followed by
        ``/page/<page>``.
    """
    base = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
    if page == 0:
        return base
    return base + "/page/" + repr(page)
def check_for_next(soup):
    """Report whether the parsed page still offers a following page.

    Looks up the element with class ``pagingDivDisabled``.

    Returns:
        ``False`` only when that element exists and ``"Next" in element`` is
        true; ``True`` in every other case, including when the element is
        missing.

    NOTE(review): ``in`` is applied to the found element itself, not to its
    text. Whether that matches the site's markup cannot be seen from here —
    confirm.
    """
    disabled_nav = soup.find(class_="pagingDivDisabled")
    if not disabled_nav:
        return True
    return "Next" not in disabled_nav
def make_soup(link):
    """Fetch ``link`` with ``jw.get_page`` and parse the result with lxml.

    Args:
        link: URL of the page to load.

    Returns:
        A ``BeautifulSoup`` object built from the fetched HTML.
    """
    return BeautifulSoup(jw.get_page(link), "lxml")
def all_the_pages(counter):
    """Yield consecutive page numbers, starting at ``counter``, up to the last page.

    Each page is loaded to inspect its paging controls. The current page
    number is always yielded, and iteration stops after the first page for
    which ``check_for_next`` reports no following page.

    Args:
        counter: Page number to start from.

    Yields:
        Page numbers in increasing order, including the final page.
    """
    while True:
        soup = make_soup(create_link(tld, counter))
        # Yield before testing for a next page: the previous version broke out
        # first, so the last page was never handed to the caller.
        yield counter
        if not check_for_next(soup):
            break
        counter += 1
def scrape_page(soup):
    """Write one CSV row per ranked site found on a parsed page.

    Takes the ``<td>`` cells of the ``rankTable`` table body, picks every
    third cell starting at index 1, strips the markup from its ``repr`` and
    writes ``[tld, text]`` through the module-level ``sites`` writer.

    Args:
        soup: Parsed page containing ``<table class="rankTable">``.
    """
    rank_table = soup.find('table', {'class': 'rankTable'})
    cells = rank_table.find('tbody').find_all("td")
    for index in range(1, len(cells), 3):
        markup = repr(cells[index])
        # Remove everything that looks like a tag, leaving only the cell text.
        site = re.sub("<[^>]*>", "", markup)
        sites.writerow([tld, site])
def main():
    """Scrape every page number produced by ``all_the_pages(0)``.

    For each page number: print it, build and print its link, load the page
    and pass it to ``scrape_page``.
    """
    for page in all_the_pages(0):
        # Single-argument print(...) produces the same output as the print
        # statement under Python 2.
        print(page)
        link = create_link(tld, page)
        print(link)
        scrape_page(make_soup(link))
# Script entry point: runs as soon as the module is executed.
main()
My thinking behind the code:
The scraper should get the page, determine if there is another page that follows, scrape the current page and move to the next one, repeating the process. If there is no next page, it should stop. Does the way I'm doing it here make sense?

As I told you, you could use selenium for programmatically clicking on the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup
def page_count():
    """Probe result pages in turn until one has no data rows.

    Requests ``.../page/1``, ``.../page/2``, ... and inspects the
    ``rankTable`` table of each response.

    Returns:
        The number of the first page whose ``rankTable`` is missing or holds
        at most one ``<tr>``.
    """
    pages = 1
    url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
    while True:
        html = requests.get(url.format(pages)).content
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser is installed, so results can differ between machines.
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table', {'class': 'rankTable'})
        # find() returns None when the table is absent; treat that page like
        # an empty one instead of failing on None.
        if table is None or len(table.find_all('tr')) <= 1:
            return pages
        pages += 1

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python - web crawling / different result from same code? / requests, bs4 / M1 - python

This happens because of the javascript on the source code. You can view the web page by pressing the ctrl + u buttons on your pc.

Related

How do you iterate over BS4 elements that has the same name?

webscraping bus stops with beautifulsoup

Object Subscriptable

Scrape all images of off a multiple pages site?

Scrape page with generator

Categories

Resources