Web scraping bus stops with BeautifulSoup - Python

I am trying to web scrape bus stop names for a given line; here is an example page for line 212: https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212. As output I want two lists, one with the bus stop names in one direction and the other with the names in the opposite direction (the two directions are clearly visible on the web page). I managed to get all the names in a single list with:
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    print(soup.prettify())
    all_bus_stops = []
    table = soup.find_all('a')
    for element in table:
        if element.get_text() in all_bus_stops:
            continue
        else:
            all_bus_stops.append(element.get_text())
    return all_bus_stops

print(download_bus_schedule('212'))
I guess the solution would be to somehow divide the soup into two parts.

You can use the bs4.element.Tag.findAll method:
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    all_bus_stops = []
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html.parser')
    for s in soup.select(".holo-list"):
        bus_stops = []
        for f in s.findAll("li"):
            if f.text not in bus_stops:
                bus_stops.append(f.text)
        all_bus_stops.append(bus_stops)
    return all_bus_stops

print(download_bus_schedule('212'))
Output:
[['Pl.Hallera', 'Pl.Hallera', 'Darwina', 'Namysłowska', 'Rondo Żaba', 'Rogowska', 'Kołowa', 'Dks Targówek', 'Metro Targówek Mieszkaniowy', 'Myszkowska', 'Handlowa', 'Metro Trocka', 'Bieżuńska', 'Jórskiego', 'Łokietka', 'Samarytanka', 'Rolanda', 'Żuromińska', 'Targówek-Ratusz', 'Św.Wincentego', 'Malborska', 'Ch Targówek'],
['Ch Targówek', 'Ch Targówek', 'Malborska', 'Św.Wincentego', 'Targówek-Ratusz', 'Żuromińska', 'Gilarska', 'Rolanda', 'Samarytanka', 'Łokietka', 'Jórskiego', 'Bieżuńska', 'Metro Trocka', 'Metro Trocka', 'Metro Trocka', 'Handlowa', 'Myszkowska', 'Metro Targówek Mieszkaniowy', 'Dks Targówek', 'Kołowa', 'Rogowska', 'Rondo Żaba', '11 Listopada', 'Bródnowska', 'Szymanowskiego', 'Pl.Hallera', 'Pl.Hallera']]

import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    bus_stops_1 = []
    bus_stops_2 = []
    directions = soup.find_all("ul", {"class": "holo-list"})
    # compare the stripped text, not the Tag object, so duplicates are actually skipped
    for stop in directions[0].find_all("a"):
        if stop.text.strip() not in bus_stops_1:
            bus_stops_1.append(stop.text.strip())
    for stop in directions[1].find_all("a"):
        if stop.text.strip() not in bus_stops_2:
            bus_stops_2.append(stop.text.strip())
    all_bus_stops = (bus_stops_1, bus_stops_2)
    return all_bus_stops

schedule = download_bus_schedule('212')
print(schedule[0])
print(schedule[1])

I may have misunderstood, as I do not know Polish, but see if this helps.
from bs4 import BeautifulSoup
import requests
from pprint import pprint

url = 'https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212'
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")
d = {}
for h2 in soup.select('h2.holo-divider'):
    d[h2.text] = []
    ul = h2.next_sibling
    for li in ul.select('li'):
        if li.a.text not in d[h2.text]:
            d[h2.text].append(li.a.text)

pprint(d)

As all stops of a direction are contained in the unordered list that follows the direction header, you could use bs4's find_next function, e.g.:
URL = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
r = requests.get(URL)
soup = BeautifulSoup(r.content,
'html5lib')
directions = ["Ch Targówek","Pl.Hallera"]
result = {}
for direction in directions:
header = soup.find(text=direction)
list = header.find_next("ul")
stops_names = [stop.get_text() for stop in list]
result[direction] = stops_names
return result
Plus, you might want to use f-strings to format your strings, as they improve readability and are less error-prone.
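As a small illustration of that suggestion (the URL is the one from the question):

bus_number = '212'

# String concatenation, as in the question:
URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number

# Equivalent f-string, arguably easier to read and harder to get wrong:
URL = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"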

How do you iterate over BS4 elements that have the same name?

It only scrapes the first table, and I'm not sure how to get it to scrape the second; they both have the same class.
from bs4 import BeautifulSoup
import requests

def getCalendarData(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    for table in soup.find_all('table', class_='ms-schedule-table ms-schedule-table--your'):
        for event in table.find_all('tbody'):
            Series = event.find('div', class_='ms-schedule-table-item-main__title').text.strip()
            Circuit = event.find('div', class_='ms-schedule-table-item-main__event').text.strip()
            Month = event.find('span', class_='ms-schedule-table-date__month').text.strip()
            Day = event.find('span', class_='ms-schedule-table-date__day').text.strip()
            print(Series, Circuit, Month, Day)

getCalendarData('https://www.motorsport.com/all/schedule/2022/upcoming/')
Your question is misleading: there is no second table on this page, only the option to load more data.
Unless you want to switch to Selenium, you can instead request the resource from which the data is dynamically reloaded:
for p in range(1, 3, 1):
    getCalendarData(f'https://www.motorsport.com/all/schedule/2022/upcoming/?all_event_types=1&p={p}')
Example: a bit more generic, with a while loop that checks whether a "load more" button is present:
from bs4 import BeautifulSoup
import requests

url = 'https://www.motorsport.com/all/schedule/2022/upcoming/'

def getCalendarData(table):
    for event in table.find_all('tbody'):
        Series = event.find('div', class_='ms-schedule-table-item-main__title').text.strip()
        Circuit = event.find('div', class_='ms-schedule-table-item-main__event').text.strip()
        Month = event.find('span', class_='ms-schedule-table-date__month').text.strip()
        Day = event.find('span', class_='ms-schedule-table-date__day').text.strip()
        print(Series, Circuit, Month, Day)

while True:
    print(f'Scraping url: {url}')
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    getCalendarData(soup.find('table', class_='ms-schedule-table ms-schedule-table--your'))
    if soup.select_one('[data-id="nextPage"]'):
        url = 'https://www.motorsport.com/' + soup.select_one('[data-id="nextPage"]').get('href')
    else:
        break

Newbie: scraping a table from a URL and not able to get output in the Python command prompt

This is a homework assignment: I have to sum up all the data in the span tags and print the total. I extracted the numbers from the span tags and appended them to a list, but I don't know how to go beyond that, as any function I type outside the for loop does not work. Also, I have to hit Enter twice after I run this in the Python command prompt to get any output.
I am new here, please forgive the format of the question, and thanks for the help.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

lst = list()
url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
spans = soup.findAll('span', attrs={'class': 'comments'})
for span in spans:
    num = int(span.text)
    lst.append(num)
    print(num)
No need to collect the numbers into a list if you are just adding them together. You can do it like this:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
spans = soup.findAll('span', attrs={'class': 'comments'})
total = 0  # named "total" to avoid shadowing the built-in sum()
for span in spans:
    total += int(span.text)
print("Total Sum = " + str(total))
As you have already prepared the list of numbers from the spans, you can get their total with Python's built-in sum() function. Pass your list as the argument to sum().
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

lst = list()
url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
spans = soup.findAll('span', attrs={'class': 'comments'})
for span in spans:
    num = int(span.text)
    lst.append(num)
    print(num)

sum_of_nums = sum(lst)
print(sum_of_nums)

Duplicate links in Python

Good morning world,
I'm new to Python and trying things out. I'm trying to remove duplicate links from the run below; currently 253 links are retrieved. Can someone please help me with this?
import requests
from bs4 import BeautifulSoup
import csv

page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
links = soup.find_all("a")
print('Number of links retrieved: ', len(links))
Convert it to a set and it will remove duplicates:
links = set(soup.find_all("a"))
Output:
Number of links retrieved: 244
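If the order matters, another option is to deduplicate by the href attribute with dict.fromkeys, which preserves insertion order (Python 3.7+); this is only a sketch and reuses the soup object from the question's code:

# collect the href values and drop duplicates while keeping first-seen order
hrefs = [a.get("href") for a in soup.find_all("a", href=True)]
unique_hrefs = list(dict.fromkeys(hrefs))  # order-preserving de-duplication
print('Number of unique hrefs: ', len(unique_hrefs))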
A set will not preserve the sort order, therefore I used a list and cleaned the href values properly.
Now the length is 123.
from bs4 import BeautifulSoup
import requests

r = requests.get("https://www.census.gov/programs-surveys/popest.html")
soup = BeautifulSoup(r.text, 'html.parser')

links = []
for item in soup.findAll("a", href=True):
    item = item.get("href")
    if not item.startswith("h"):
        # relative link -> prefix with the site root
        item = f"https://www.census.gov/{item}"
    if item not in links:
        links.append(item)
        print(item)

print(len(links))
Output:
https://www.census.gov/#content
https://www.census.gov/en.html
https://www.census.gov/topics/population/age-and-sex.html
https://www.census.gov/businessandeconomy
https://www.census.gov/topics/education.html
https://www.census.gov/topics/preparedness.html
https://www.census.gov/topics/employment.html
https://www.census.gov/topics/families.html
https://www.census.gov/topics/population/migration.html
https://www.census.gov/geo
https://www.census.gov/topics/health.html
https://www.census.gov/topics/population/hispanic-origin.html
https://www.census.gov/topics/housing.html
https://www.census.gov/topics/income-poverty.html
https://www.census.gov/topics/international-trade.html
https://www.census.gov/topics/population.html
https://www.census.gov/topics/population/population-estimates.html
https://www.census.gov/topics/public-sector.html
https://www.census.gov/topics/population/race.html
https://www.census.gov/topics/research.html
https://www.census.gov/topics/public-sector/voting.html
https://www.census.gov/about/index.html
https://www.census.gov/data
https://www.census.gov/academy
https://www.census.gov/about/what/admin-data.html
https://www.census.gov/data/data-tools.html
https://www.census.gov/developers/
https://www.census.gov/data/experimental-data-products.html
https://www.census.gov/data/related-sites.html
https://www.census.gov/data/software.html
https://www.census.gov/data/tables.html
https://www.census.gov/data/training-workshops.html
https://www.census.gov/library/visualizations.html
https://www.census.gov/library.html
https://www.census.gov/AmericaCounts
https://www.census.gov/library/audio.html
https://www.census.gov/library/fact-sheets.html
https://www.census.gov/library/photos.html
https://www.census.gov/library/publications.html
https://www.census.gov/library/video.html
https://www.census.gov/library/working-papers.html
https://www.census.gov/programs-surveys/are-you-in-a-survey.html
https://www.census.gov/programs-surveys/decennial-census/2020census-redirect.html
https://www.census.gov/2020census
https://www.census.gov/programs-surveys/acs
https://www.census.gov/programs-surveys/ahs.html
https://www.census.gov/programs-surveys/abs.html
https://www.census.gov/programs-surveys/asm.html
https://www.census.gov/programs-surveys/cog.html
https://www.census.gov/programs-surveys/cbp.html
https://www.census.gov/programs-surveys/cps.html
https://www.census.gov/EconomicCensus
https://www.census.gov/internationalprograms
https://www.census.gov/programs-surveys/metro-micro.html
https://www.census.gov/popest
https://www.census.gov/programs-surveys/popproj.html
https://www.census.gov/programs-surveys/saipe.html
https://www.census.gov/programs-surveys/susb.html
https://www.census.gov/programs-surveys/sbo.html
https://www.census.gov/sipp/
https://www.census.gov/programs-surveys/surveys-programs.html
https://www.census.gov/newsroom.html
https://www.census.gov/partners
https://www.census.gov/programs-surveys/sis.html
https://www.census.gov/NAICS
https://www.census.gov/library/reference/code-lists/schedule/b.html
https://www.census.gov/data/developers/data-sets/Geocoding-services.html
https://www.census.gov/about-us
https://www.census.gov/about/who.html
https://www.census.gov/about/what.html
https://www.census.gov/about/business-opportunities.html
https://www.census.gov/careers
https://www.census.gov/fieldjobs
https://www.census.gov/about/history.html
https://www.census.gov/about/policies.html
https://www.census.gov/privacy
https://www.census.gov/regions
https://www.census.gov/about/contact-us/staff-finder.html
https://www.census.gov/about/contact-us.html
https://www.census.gov/about/faqs.html
https://www.commerce.gov/
https://www.census.gov//en.html
https://www.census.gov//programs-surveys.html
https://www.census.gov//popest
https://www.census.gov//programs-surveys/popest/about.html
https://www.census.gov//programs-surveys/popest/data.html
https://www.census.gov//programs-surveys/popest/geographies.html
https://www.census.gov//programs-surveys/popest/guidance.html
https://www.census.gov//programs-surveys/popest/guidance-geographies.html
https://www.census.gov//programs-surveys/popest/library.html
https://www.census.gov//programs-surveys/popest/news.html
https://www.census.gov//programs-surveys/popest/technical-documentation.html
https://www.census.gov//programs-surveys/popest/data/tables.html
https://www.census.gov//programs-surveys/popest/about/schedule.html
https://www.census.gov//newsroom/press-releases/2019/popest-nation.html
https://www.census.gov//newsroom/press-releases/2019/popest-nation/popest-nation-spanish.html
https://www.census.gov//newsroom/press-releases/2019/new-years-2020.html
https://www.census.gov//data/tables/time-series/demo/popest/pre-1980-national.html
https://www.census.gov//data/tables/time-series/demo/popest/pre-1980-state.html
https://www.census.gov//data/tables/time-series/demo/popest/pre-1980-county.html
https://www.census.gov//library/publications/2015/demo/p25-1142.html
https://www.census.gov//library/publications/2010/demo/p25-1139.html
https://www.census.gov//library/publications/2010/demo/p25-1138.html
https://www.census.gov//programs-surveys/popest/library/publications.html
https://www.census.gov//library/visualizations/2020/comm/superbowl.html
https://www.census.gov//library/visualizations/2019/comm/slower-growth-nations-pop.html
https://www.census.gov//library/visualizations/2019/comm/happy-new-year-2020.html
https://www.census.gov//programs-surveys/popest/library/visualizations.html
https://www.census.gov/#
https://www.census.gov/#uscb-nav-skip-header
https://www.census.gov/newsroom/blogs.html
https://www.census.gov/newsroom/stories.html
https://www.facebook.com/uscensusbureau
https://twitter.com/uscensusbureau
https://www.linkedin.com/company/us-census-bureau
https://www.youtube.com/user/uscensusbureau
https://www.instagram.com/uscensusbureau/
https://www.census.gov/quality/
https://www.census.gov/datalinkage
https://www.census.gov/about/policies/privacy/privacy-policy.html#accessibility
https://www.census.gov/foia
https://www.usa.gov/
https://www.census.gov//
123

How do I create a dataframe of jobs and companies that includes hyperlinks?

I am making a function to print a list of links so I can add them to a list of companies and job titles. However, I am having difficulties navigating tag sub-contents. I am looking to list all the 'href' attributes of the 'a' tags inside each 'div', like so:
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests

page = "https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html"
headers = {'User-Agent': 'Mozilla/5.0'}

def get_soup():
    session = requests.Session()
    pageTree = session.get(page, headers=headers)
    return BeautifulSoup(pageTree.content, 'html.parser')

pageSoup = get_soup()

def print_links():
    """this function scrapes the job title links"""
    jobLink = [div.a for div in pageSoup.find_all('div', class_='title')]
    for div in jobLink:
        print(div['href'])
I am trying to make a list, but my result is simply text and does not seem to be a proper link, like so:
/pagead/clk?mo=r&ad=-6NYlbfkN0DhVAxkc_TxySVbUOs6bxWYWOfhmDTNcVTjFFBAY1FXZ2RjSBnfHw4gS8ZdlOOq-xx2DHOyKEivyG9C4fWOSDdPgVbQFdESBaF5zEV59bYpeWJ9R8nSuJEszmv8ERYVwxWiRnVrVe6sJXmDYTevCgexdm0WsnEsGomjLSDeJsGsHFLAkovPur-rE7pCorqQMUeSz8p08N_WY8kARDzUa4tPOVSr0rQf5czrxiJ9OU0pwQBfCHLDDGoyUdvhtXy8RlOH7lu3WEU71VtjxbT1vPHPbOZ1DdjkMhhhxq_DptjQdUk_QKcge3Ao7S3VVmPrvkpK0uFlA0tm3f4AuVawEAp4cOUH6jfWSBiGH7G66-bi8UHYIQm1UIiCU48Yd_pe24hfwv5Hc4Gj9QRAAr8ZBytYGa5U8z-2hrv2GaHe8I0wWBaFn_m_J10ikxFbh6splYGOOTfKnoLyt2LcUis-kRGecfvtGd1b8hWz7-xYrYkbvs5fdUJP_hDAFGIdnZHVJUitlhjgKyYDIDMJ-QL4aPUA-QPu-KTB3EKdHqCgQUWvQud4JC2Fd8VXDKig6mQcmHhZEed-6qjx5PYoSifi5wtRDyoSpkkBx39UO3F918tybwIbYQ2TSmgCHzGm32J4Ny7zPt8MPxowRw==&p=0&fvj=1&vjs=3
Additionally, here is my attempt at making a list with the links:
def get_job_titles():
    """this function scrapes the job titles"""
    jobs = []
    jobTitle = pageSoup.find_all('div', class_='title')
    for span in jobTitle:
        link = span.find('href')
        if link:
            jobs.append({'title': link.text,
                         'href': link.attrs['href']})
        else:
            jobs.append({'title': span.text, 'href': None})
    return jobs
I would regex the required info out of the returned HTML and construct each URL from the parameters that the page's JavaScript uses to build the links dynamically. Interestingly, the total number of listings is different when using requests than when using a browser. You can manually enter the number of listings, e.g. 6175 (the current count), or use the number returned by the request (which is lower, so you would miss some results); you could also use Selenium to get the correct initial result count. You can then issue requests with offsets to get all listings.
Listings can be randomized in terms of ordering.
It seems you can add a limit parameter to increase the results per page up to 50, e.g.
https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&limit=50&start=0
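As a sketch of how paging with that limit parameter could look (the query parameters are just those visible in the URL above; the cap of 200 listings is an arbitrary assumption for illustration):

import requests

base = 'https://www.indeed.com/jobs'
params = {'q': 'software developer', 'l': 'San Francisco', 'limit': 50}
headers = {'User-Agent': 'Mozilla/5.0'}

for start in range(0, 200, 50):  # assumed cap of 200 listings, purely illustrative
    params['start'] = start
    r = requests.get(base, params=params, headers=headers)
    print(r.status_code, r.url)  # each response holds up to 50 listings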
Furthermore, it seems that it is possible to retrieve more results than are actually given as the total results count on the webpage.
Python, with 10 results per page:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs

p = re.compile(r"jobmap\[\d+\]= ({.*?})")
p1 = re.compile(r"var searchUID = '(.*?)';")
counter = 0
final = {}

with requests.Session() as s:
    r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    soup = bs(r.content, 'lxml')
    tk = p1.findall(r.text)[0]
    listings_per_page = 10
    number_of_listings = int(soup.select_one('[name=description]')['content'].split(' ')[0].replace(',', ''))
    #number_of_pages = math.ceil(number_of_listings/listings_per_page)
    number_of_pages = math.ceil(6175/listings_per_page)  # manually calculated

    for page in range(1, number_of_pages + 1):
        if page > 1:
            r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(10*page-1))
            soup = bs(r.content, 'lxml')
            tk = p1.findall(r.text)[0]
        for item in p.findall(r.text):
            data = hjson.loads(item)
            jk = data['jk']
            row = {'title': data['title'],
                   'company': data['cmp'],
                   'url': f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3'}
            final[counter] = row
            counter += 1

df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)
If you want to use Selenium to get the correct initial listings count:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
d = webdriver.Chrome(r'C:\Users\HarrisQ\Documents\chromedriver.exe', options=options)
d.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
number_of_listings = int(d.find_element_by_css_selector('[name=description]').get_attribute('content').split(' ')[0].replace(',', ''))
d.quit()

p = re.compile(r"jobmap\[\d+\]= ({.*?})")
p1 = re.compile(r"var searchUID = '(.*?)';")
counter = 0
final = {}

with requests.Session() as s:
    r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    soup = bs(r.content, 'lxml')
    tk = p1.findall(r.text)[0]
    listings_per_page = 10
    number_of_pages = math.ceil(6175/listings_per_page)  # manually calculated

    for page in range(1, number_of_pages + 1):
        if page > 1:
            r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(10*page-1))
            soup = bs(r.content, 'lxml')
            tk = p1.findall(r.text)[0]
        for item in p.findall(r.text):
            data = hjson.loads(item)
            jk = data['jk']
            row = {'title': data['title'],
                   'company': data['cmp'],
                   'url': f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3'}
            final[counter] = row
            counter += 1

df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)

Unable to return all the results at once

I've written a script in Python to fetch some links from a webpage. There are two functions within my script: the first collects links to the local businesses from a webpage, and the second traverses those links and collects URLs to the various events.
When I try with the script found here, I get the desired results.
How can I return all the results while complying with the design below?
The following script returns the results of individual links, whereas I wish to return all the results at once while keeping the design as it is (the logic may vary).
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

linklist = []

def collect_links(link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    items = [urljoin(url, item.get("href")) for item in soup.select(".business-listings-category-list .field-content a[hreflang]")]
    return items

def fetch_info(ilink):
    res = requests.get(ilink)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".business-teaser-title a[title]"):
        linklist.append(urljoin(url, item.get("href")))
    return linklist

if __name__ == '__main__':
    url = "https://www.parentmap.com/atlas"
    for itemlink in collect_links(url):
        print(fetch_info(itemlink))
First of all, I removed the global linklist, since it is returned from the function anyway and keeping it global creates overlapping results. Next, I added a function to "assemble" the links the way you wanted. I used a set to prevent duplicate links.
#!/usr/bin/python
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def collect_links(link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    items = [urljoin(url, item.get("href")) for item in soup.select(".business-listings-category-list .field-content a[hreflang]")]
    return items

def fetch_info(ilink):
    linklist = []
    res = requests.get(ilink)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".business-teaser-title a[title]"):
        linklist.append(urljoin(url, item.get("href")))
    return linklist

def fetch_all_links(url):
    links = set()
    for itemlink in collect_links(url):
        links.update(fetch_info(itemlink))
    return list(links)

if __name__ == '__main__':
    url = "https://www.parentmap.com/atlas"
    print(fetch_all_links(url))
The main reason you are getting the results one after another is that you call fetch_info in a loop, which invokes the function again and again and prints the data piece by piece, rather than looping inside the fetch_info function itself. Try the code below:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

linklist = []

def collect_links(link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    items = [urljoin(url, item.get("href")) for item in soup.select(".business-listings-category-list .field-content a[hreflang]")]
    return items

def fetch_info(url):
    # loop over the business links inside this function instead of in __main__
    for itemlink in collect_links(url):
        res = requests.get(itemlink)
        soup = BeautifulSoup(res.text, "lxml")
        for item in soup.select(".business-teaser-title a[title]"):
            linklist.append(urljoin(url, item.get("href")))
    return linklist

if __name__ == '__main__':
    url = "https://www.parentmap.com/atlas"
    print(fetch_info(url))
