I am currently creating different projects to grasp the concept of web scraping. Right now I am trying to build a database of items from a shoe-selling site, but I can't seem to get the data in text form.
Here is what I have tried:
from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import numpy
import statistics
import pandas as pd

offset=0
driver=webdriver.Chrome()
listo=[]

while True:
    driver.get("https://stockx.com/sneakers?page={offset}".format(offset=offset))
    time.sleep(10)
    main_div=driver.find_elements_by_xpath('//*[@id="main-content"]/div[2]/div[2]/div/div')
    for div in main_div:
        links=div.find_elements_by_tag_name("a")
        for link in links:
            namer=(link.get_attribute('href'))
            print(namer)
            offset+=0.05
            listo.append(namer)
    namelist = sorted(set(listo))
    for hreflink in namelist:
        hreflinks=(hreflink)
        driver.get(hreflinks)
        time.sleep(10)
        LastsaleD=driver.find_elements_by_xpath('//*[@id="marketsummary"]/div[2]/div/div[1]/div[1]')
        print(LastsaleD).text
    if offset>30:
        break
Using Selenium is overkill and less efficient here. The data is available in JSON format within the <script> tags of the source HTML. Just make a simple request to the site, pull out the relevant <script> containing the JSON, then parse the JSON into rows to put into a table.
Also, why increment offset += 0.05? I understand your logic of adding it for every one of the 20 items on the page, but why not just increment by 1 after the loop through those 20 items? What happens if, for whatever reason, you get 19 or 21 items returned? Then your increments will be off for the rest of the loop.
Anyways, here's the code. This will get you going.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}

at_end = False
offset = 0
rows = []
while at_end == False:
    offset += 1
    url = "https://stockx.com/sneakers?page={offset}".format(offset=offset)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    scripts = soup.find_all('script', {'type': 'application/ld+json'})
    for script in scripts:
        jsonMatch = re.compile("{.*}")
        jsonStr = jsonMatch.search(str(script))[0]
        jsonData = json.loads(jsonStr)
        if jsonData['@type'] == 'OfferCatalog':
            break

    listings = jsonData['itemListElement']
    for listing in listings:
        item = listing['item']
        offers = item.pop('offers')
        item.update(offers)
        if item not in rows:
            rows.append(item)
        else:
            at_end = True
            continue

    print('Page: %s' %offset)

df = pd.DataFrame(rows)
Output:
print(df)
              @type        brand ... highPrice priceCurrency
0 AggregateOffer Jordan ... 165 GBP
1 AggregateOffer Jordan ... 226 GBP
2 AggregateOffer Jordan ... 321 GBP
3 AggregateOffer Jordan ... 159 GBP
4 AggregateOffer Jordan ... 190 GBP
.. ... ... ... ... ...
495 AggregateOffer Nike ... 230 GBP
496 AggregateOffer New Balance ... 159 GBP
497 AggregateOffer Nike ... 152 GBP
498 AggregateOffer Nike ... 162 GBP
499 AggregateOffer Nike ... 167 GBP
[500 rows x 14 columns]
I'm new to web scraping and was trying to get a basic web-scraping script to work. The code runs just fine; the problem is that the CSV file it produces has no information in it: it only shows the name of each column, with no data. Any help would be appreciated.
import requests
from bs4 import BeautifulSoup
import csv

def scrape_cars(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "lxml")
    cars = []
    for car_div in soup.find_all("div", class_="c-search-card"):
        car = {}
        car["title"] = car_div.find("h2").text.strip()
        car["price"] = car_div.find("div", class_="c-search-card__price").text.strip()
        car["location"] = car_div.find("div", class_="c-search-card__location").text.strip()
        car["year"] = car_div.find("div", class_="c-search-card__year").text.strip()
        car["km"] = car_div.find("div", class_="c-search-card__km").text.strip()
        car["color"] = car_div.find("div", class_="c-search-card__color").text.strip()
        car["carrosserie"] = car_div.find("div", class_="c-search-card__body-type").text.strip()
        car["puissance fiscale"] = car_div.find("div", class_="c-search-card__tax-horsepower").text.strip()
        car["boite"] = car_div.find("div", class_="c-search-card__transmission").text.strip()
        cars.append(car)
    return cars

url = "https://www.automobile.tn/fr/occasion"
cars = scrape_cars(url)

# write to CSV file
with open("cars.csv", "w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["title", "price", "location", "year", "km", "color", "carrosserie", "puissance fiscale", "boite"])
    writer.writeheader()
    for car in cars:
        writer.writerow(car)
This is what I get in the CSV file.
Here is one way of getting that information you're after:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm  ## if using jupyter notebook: from tqdm.notebook import tqdm

big_list = []
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}

s = requests.Session()
s.headers.update(headers)

for x in tqdm(range(1, 25)):  ## to get all cars set range to 266
    soup = bs(s.get(f'https://www.automobile.tn/fr/occasion/{x}').text, 'html.parser')
    cars = soup.select('div[class="occasion-item"]')
    for c in cars:
        title = c.select_one('h2').text.strip()
        price = c.select_one('div[class="price"]').text.strip()
        big_list.append((title, price))
        ## add other elements as needed

df = pd.DataFrame(big_list, columns=['title', 'price'])
# df.to_csv('various_cars.csv') ## uncomment to save as csv
print(df)
Result in terminal:
100% 24/24 [00:25<00:00, 1.08it/s]
title price
0 Mazda CX-5 69 700 DT
1 Mercedes-Benz Classe E 53 000 DT
2 Mercedes-Benz Classe E 252 000 DT
3 Seat Arona 71 500 DT
4 Volkswagen Golf 7 47 000 DT
... ... ...
283 BMW Série 1 74 000 DT
284 BMW Série 3 135 000 DT
285 Volkswagen Golf 7 70 000 DT
286 Mercedes-Benz Classe C coupé 159 000 DT
287 Volkswagen Jetta 36 000 DT
288 rows × 2 columns
I am trying to scrape a Realtor webpage and I am successful in doing so using Requests and BS4, but the main problem is that it sometimes returns 1 item and sometimes 2, depending on whether the item is present in the listing or not. Both of these items have the same div tag and class name, so I can't differentiate them.
My code is below:
import requests
from bs4 import BeautifulSoup
import pandas as pd

html = requests.get('https://www.realtor.com/realestateagents/84664/pg-1')
doc = BeautifulSoup(html.text, 'html.parser')

names = []
contacts = []
for_sale = []
sold = []
price_range = []
last_listing_date = []

for box in doc.find_all('div', class_='jsx-3970352998 agent-list-card clearfix'):
    names.append(box.find('div', class_='jsx-3970352998 agent-name text-bold').text)
    try:
        contacts.append(box.find('div', class_='jsx-3970352998 agent-phone hidden-xs hidden-xxs'))
    except IndexError:
        contacts.append('No contact number found')
    property_data = box.find_all('div', class_='jsx-3970352998 agent-detail-item ellipsis')
    try:
        for_sale.append(property_data[0].span.text)
    except:
        for_sale.append('None')
    try:
        sold.append(property_data[1].span.text)
    except:
        sold.append('0')
    price_activity = box.find_all('div', class_='jsx-3970352998 second-column col-lg-6 no-padding')
    a = price_activity[0].find_all('div', class_='jsx-3970352998 agent-detail-item')
    print(len(a))
    try:
        price_range.append(a[0].span.text)
        print(a[0].span.text)
    except IndexError:
        print('No activity range found')
        price_range.append('No activity range found')
    try:
        print(a[1].span.text)
        last_listing_date.append(a[1].span.text)
    except IndexError:
        print('No listing data found')
        last_listing_date.append('No listing data found')

df = pd.DataFrame(data={'Name': names, 'Contact': contacts, 'Active Listings': for_sale, 'Properties Sold': sold,
                        'Price Range': price_range, 'Last Listing Date': last_listing_date})
df
And this is my output. You can see I have highlighted in yellow the values that end up in the wrong column, because some listings don't have an Activity Range, so they only return one thing, which is the Last Listing Date. My current code cannot handle this and I am not sure how to tackle the problem. In the desired output, the values should be in the places I marked with red dots.
My output
It seems that the element locator strategy wasn't right.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.realtor.com/realestateagents/84664/pg-{page}'
data = []
for page in range(1, 6):
    req = requests.get(url.format(page=page))
    soup = BeautifulSoup(req.text, 'html.parser')
    for card in soup.select('div.cardWrapper > ul > div'):
        names = card.select_one('div[class="jsx-3970352998 agent-name text-bold"]').text
        contacts = card.select_one('div[class="jsx-3970352998 agent-group text-semibold ellipsis"]').get_text(strip=True)
        for_sale = card.select_one('div[class="jsx-3970352998 agent-detail-item ellipsis"]:nth-child(1) > span').text
        sold = card.select_one('div[class="jsx-3970352998 agent-detail-item ellipsis"]:nth-child(1) > span').text
        price = card.select_one('div:-soup-contains("Activity range") > span')
        price_range = price.text if price else None
        date = card.select_one('div:-soup-contains("Listed a house") > span')
        last_listing_date = date.text if date else None

        data.append({
            'names': names,
            'contacts': contacts,
            'for_sale': for_sale,
            'sold': sold,
            'price_range': price_range,
            'last_listing_date': last_listing_date
        })

df = pd.DataFrame(data)
print(df)
Output:
names contacts ... price_range last_listing_date
0 Clint Allred Kw South Valley Keller Williams ... $370K - $1.08M 2022-08-18
1 Martha McMullin The Group Real Estate, LLC ... $495K - $995K 2022-08-18
2 Aren Bybee R and R Realty, LLC ... $115K - $2.49M 2022-08-18
3 Kenny ParcellTeam Equity Real Estate - Utah ... $125K - $1.2M 2022-08-17
4 Eric MossTeam Equity Real Estate - Utah ... $125K - $600K 2022-08-17
.. ... ... ... ... ...
95 Marny Schlopy Coldwell Banker Realty ... $410K - $756K None
96 Amy Laster-Haynes Better Homes and Gardens Real Estate Momentum ... $364K - $2.62M None
97 Raquel Jex Presidio Real Estate Company ... $442K - $442K None
98 Kelly Ercanbrack Unite Real Estate ... $400K - $400K None
99 Camie Jefferies Equity Real Estate - Tooele ... None Reported None
[100 rows x 6 columns]
You should be able to get the data you're looking for like this:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

s = requests.Session()
s.headers.update(headers)
big_list = []

for x in tqdm(range(1, 12)):
    r = s.get(f'https://www.realtor.com/realestateagents/84664/pg-{x}', headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    agent_cards = soup.select('div[data-testid="component-agentCard"]')
    for a in agent_cards:
        name = a.select_one('div.agent-name').get_text(strip=True)
        company = a.select_one('div.agent-group').get_text(strip=True)
        try:
            phone = a.select_one('div.agent-phone').get_text(strip=True)
        except Exception as e:
            phone = 'Phoneless'
        try:
            experience = a.select_one('div#agentExperience').get_text(strip=True)
        except Exception as e:
            experience = 'Quite inexperienced'
        try:
            h_for_sale = a.select_one('span.sale-sold-count').get_text(strip=True)
        except Exception as e:
            h_for_sale = 0
        big_list.append((name, company, phone, experience, h_for_sale))

df = pd.DataFrame(big_list, columns=['Name', 'Company', 'Phone', 'Experience', 'For sale'])
print(df)
Result:
                Name                       Company           Phone                    Experience  For sale
0    Martha McMullin    The Group Real Estate, LLC  (303) 638-1033            Experience:8 years         2
1         Aren Bybee           R and R Realty, LLC  (801) 210-1461  Experience:22 years 2 months        31
2  Kenny ParcellTeam     Equity Real Estate - Utah  (801) 794-7777  Experience:26 years 7 months        24
3      Eric MossTeam     Equity Real Estate - Utah  (801) 669-0383  Experience:10 years 5 months        10
4     Chantelle Rees  Equity Real Estate - Results  (801) 636-2515           Quite inexperienced         4
[...]
Using the logic above, you can obtain other info as well and include it in the dataframe (see the sketch after the links below). BeautifulSoup docs: https://beautiful-soup-4.readthedocs.io/en/latest/index.html
Also, TQDM: https://pypi.org/project/tqdm/
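For example, here is a minimal sketch of pulling one extra field per card. It assumes each agent card contains an anchor linking to the agent's profile page; the a[href*="/realestateagents/"] selector and the extra 'Profile' column are assumptions, not verified against the live page:
from bs4 import BeautifulSoup

def profile_url(card):
    # 'card' is one of the agent_cards tags from the loop above;
    # the anchor selector below is an assumption about the card markup
    link = card.select_one('a[href*="/realestateagents/"]')
    return link['href'] if link else None

# hypothetical usage inside the loop above, with a matching extra column:
# big_list.append((name, company, phone, experience, h_for_sale, profile_url(a)))
# df = pd.DataFrame(big_list, columns=['Name', 'Company', 'Phone', 'Experience', 'For sale', 'Profile'])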
I'm practicing scraping with BeautifulSoup on a job page, but my print returns None for some odd reason. Any ideas?
from bs4 import BeautifulSoup
import requests
import csv
url = 'https://jobgether.com/es/oferta/63083ece6d137a0ac6e701e6-part-time-business-psychologist-intern'
website = requests.get(url)
Soup = BeautifulSoup(website.content, 'html.parser')
Title = Soup.find('h5', class_="mb-0 p-2 w-100 bd-highlight fs-22")
print(Title)
That page is being hydrated with data via a JavaScript API: you can find that API by inspecting the Network tab in Dev tools, where you can see the information being pulled as JSON from that API endpoint. This is one way to obtain that data, using requests:
import requests
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}

url = 'https://filter-api.jobgether.com/api/offer/63083ece6d137a0ac6e701e6?%24populate%5B0%5D%5Bpath%5D=meta.continents&%24populate%5B0%5D%5Bselect%5D=name&%24populate%5B1%5D=meta.countries&%24populate%5B2%5D=meta.regions&%24populate%5B3%5D=meta.cities&%24populate%5B4%5D=meta.studiesArea&%24populate%5B5%5D=meta.salary&%24populate%5B6%5D=meta.languages&%24populate%5B7%5D=meta.hardSkills&%24populate%5B8%5D=meta.industries&%24populate%5B9%5D=meta.technologies&%24populate%5B10%5D%5Bpath%5D=company&%24populate%5B10%5D%5Bselect%5D=name%20meta.logo%20meta.industries%20meta.companyType%20meta.flexiblePolicy%20meta.employees%20meta.mainOfficeLocation%20meta.subOfficeLocation%20status%20description%20meta.mission%20meta.description%20meta.hardSkills%20meta.technologies%20meta.slug&%24populate%5B10%5D%5Bpopulate%5D%5B0%5D=meta.industries&%24populate%5B10%5D%5Bpopulate%5D%5B1%5D=meta.mainOfficeLocation&%24populate%5B10%5D%5Bpopulate%5D%5B2%5D=meta.subOfficeLocation'

r = requests.get(url, headers=headers)
obj = r.json()
print(obj['title'])
print(obj['meta']['apply_url'])
print(obj['meta']['countries'])
df = pd.json_normalize(obj['meta']['hardSkills'])
print(df)
This will display in terminal:
Part-Time Business Psychologist Intern
https://it.linkedin.com/jobs/view/externalApply/3221880417?url=https%3A%2F%2Fteamtailor%2Eassessfirst%2Ecom%2Fjobs%2F1462616-uk-part-time-business-psychologist-student-intern%3Fpromotion%3D464724-trackable-share-link-uk-business-psychologist-li&urlHash=dzk3&trk=public_jobs_apply-link-offsite
[{'_id': '622a65b4671f2c8b98fac83f', 'name': 'United Kingdom', 'alpha_code': 'GBR', 'continent': '622a659af0bac38678ed1398', 'geo': [-0.127758, 51.507351], 'name_es': 'Reino Unido', 'name_fr': 'Royaume-Uni', 'deleted_at': None, 'amount_of_use': 11407, 'alpha_2_code': 'GB'}]
_id id name name_es name_fr category_id status createdAt updatedAt deletedAt hard_skill_categories hard_skill_category
0 623ca7112198fdff24e1a1b0 5 Design Design Design 1 1 0000-00-00 00:00:00 0000-00-00 00:00:00 None Marketing 621d2a97058dc9445a92c4be
1 623ca7112198fdff24e1a249 173 Research Investigación Recherche 8 1 0000-00-00 00:00:00 0000-00-00 00:00:00 None Business 621d2a97058dc9445a92c4c5
2 623ca7112198fdff24e1a24a 174 Science Ciencia Science 8 1 0000-00-00 00:00:00 0000-00-00 00:00:00 None Business 621d2a97058dc9445a92c4c5
3 623ca7112198fdff24e1a292 1165 Customer Success Customer Success Customer Success 4 1 2021-07-07 10:53:19 2021-07-07 10:53:19 None Sales 621d2a97058dc9445a92c4c1
You can print out the full json response, inspect it, dissect it and extract the relevant information from it (it's quite comprehensive).
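For instance, a quick way to inspect it (a sketch reusing the obj variable from the snippet above; the keys you will find inside are whatever the API actually returns):
import json

# pretty-print the whole response to see every available key/value
print(json.dumps(obj, indent=2, ensure_ascii=False))

# or just list the top-level keys before deciding what to extract
print(list(obj.keys()))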
Relevant documentation for requests:
https://requests.readthedocs.io/en/latest/
And also, pandas documentation:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html
I am working on scraping multiple pages of search results from this website into a neatly formatted pandas dataframe.
I've outlined the steps for how I plan to finish this task:
1.) Identify information from each result I want to pull (3 things)
2.) Pull all the information from the 3 things into separate lists
3.) Append items in lists through for loop into pandas dataframe
Here is what I've tried so far:
import requests
import pandas as pd
#!pip install bs4
from bs4 import BeautifulSoup as bs

url = 'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
result = requests.get(url, headers=headers)
soup = bs(result.text, 'html.parser')

titles = soup.find_all('h5')
authors = soup.find_all('p')
#dates = soup.find_all('')

#append in for loop
data = []
for i in range(2, 22):
    data.append(titles[i].text)
    data.append(authors[i].text)
    #data.append(dates[i].text)

data = pd.DataFrame()
Before I convert data to a pandas dataframe, I can see the results, but the last line essentially erases the results.
Also, I'm not quite sure how to iterate over the multiple search result pages. I found some code that allows you to pick a starting and ending web page to iterate over like this:
URL = ['https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy&page=2',
       'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy&page=4']

for url in range(0, 2):
    req = requests.get(URL[url])
    soup = bs(req.text, 'html.parser')
    titles = soup.find_all('h5')
    print(titles)
The issue I'm having with this approach is that the first page is not formatted the same as all the other pages. Starting on page two, the end of the url reads, "&page=2". Not sure how to account for that.
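One idea I had (just a sketch, and I haven't confirmed that the plain search URL and page 1 return the same results) is to only append &page=N for pages after the first:
base = 'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy'

# page 1 is the plain search URL; later pages just add &page=N
urls = [base if page == 1 else f'{base}&page={page}' for page in range(1, 5)]
for url in urls:
    print(url)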
To summarize the end result I'm looking for would be a dataframe that looks something like this:
Title Author Date
Blah1 Agency1 09/23/2020
Blah2 Agency2 08/22/2018
Blah3 Agency3 06/02/2017
....
Can someone please help point me in the right direction? Very lost on this one.
I think you don't need to parse all pages, just download the csv.
import pandas as pd
import requests
import io
url = 'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy'
url += '&format=csv' # <- Download as CSV
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
result = requests.get(url, headers=headers)
df = pd.read_csv(io.StringIO(result.text))
Output:
>>> df
title type ... pdf_url publication_date
0 Corporate Average Fuel Economy Standards for M... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/03/2021
1 Public Hearing for Corporate Average Fuel Econ... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/14/2021
2 Investigation of Urea Ammonium Nitrate Solutio... Notice ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/08/2021
3 Anchorage Regulations; Mississippi River, Mile... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-08... 08/30/2021
4 Call for Nominations To Serve on the National ... Notice ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/08/2021
.. ... ... ... ... ...
112 Endangered and Threatened Wildlife and Plants;... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/07/2021
113 Energy Conservation Program: Test Procedures f... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/01/2021
114 Taking of Marine Mammals Incidental to Commerc... Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/17/2021
115 Partial Approval and Partial Disapproval of Ai... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/24/2021
116 Clean Air Plans; California; San Joaquin Valle... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/01/2021
[117 rows x 8 columns]
If I understand your question, then here is a working solution. The starting URL and the URL with page number 1 are the same thing, and I scrape page range(1, 5), meaning 4 pages. You can increase or decrease the range of page numbers at any time. To store the data in CSV format, uncomment the last line.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

data = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'}

for page in range(1, 5):
    url = 'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy%27&page={page}'.format(page=page)
    print(url)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    tags = soup.find_all('div', class_='document-wrapper')
    for pro in tags:
        title = pro.select_one('h5 a').get_text(strip=True)
        author = pro.select_one('p a:nth-child(1)').get_text(strip=True)
        date = pro.select_one('p a:nth-child(2)').get_text(strip=True)
        data.append([title, author, date])

cols = ["Title", "Author", "Date"]
df = pd.DataFrame(data, columns=cols)
print(df)
#df.to_csv("data_info.csv", index = False)
I am currently trying to scrape store locations for a research project that aims to show the effect COVID had on different retailers. The retailer I am having issues with currently is "The Source". It's a Canadian retailer that has a large number of stores across Canada, and its stores are generally small compared to Best Buy. The store locator page is: https://www.thesource.ca/en-ca/store-finder
The goal for this code is to produce an Excel file with columns for address, postal code and phone number (I assume I'll just use pandas for this). Those three are also the data I want to scrape. The code I've written so far is, I think, on the right track; the information is for the most part under a table. However, I am struggling to get to the 'li' tags and to loop through the different rows of the table. If anyone has an idea of how I would grab the 'li' tags for each piece of data I want, that would be great!
import requests
from bs4 import BeautifulSoup

url = 'https://www.thesource.ca/en-ca/store-finder'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

Locations_table = soup.find('table', class_='storeResultList store-result-list desktop-only')
for locations in Locations_table.find_all('tbody'):
    rows = locations.find_all('tr', class_='storeItem store-result-row')
    for row in rows:
        address = row.find('td', class_='address')
        # trying to get address
        # postal
        # phone number which I think is not under this table

print(Locations_table)
We code according to logic! Once you have the logic, you can parse toward it.
The logic here is that most of the addresses have a length of 6 items, while the malformed ones have a length of 5, so we can clean that up.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from more_itertools import collapse

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}


def main(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    goal = [list(x.stripped_strings)
            for x in soup.select_one('.storeResultList').select('.address')[1:]]
    allin = []
    for x in goal:
        if len(x) == 5:
            x.insert(2, 'N/A')
        x[3] = x[3].rsplit(",", 1)
        allin.append(list(collapse(x)))
    df = pd.DataFrame(
        allin, columns=["Name", "Address", "Unit", "City", "State", "Zip", "Phone"])
    df.to_csv('data.csv', index=False)


main('https://www.thesource.ca/en-ca/store-finder')
Output:
Name Address Unit City State Zip Phone
0 Optimist Square 4725 Dorchester Rd Unit #B10 NIAGARA FALLS ON L2E 0A8 905-356-0772
1 SEAWAY MALL 800 NIAGARA ST N UNIT #K12 WELLAND ON L3C5Z4 905-735-2136
2 PEN CENTRE 221 GLENDALE AVE N/A ST CATHARINES ON L2T2K9 905-684-1456
3 GRIMSBY SQUARE SC 44 Livingston Ave. Unit #1006A GRIMSBY ON L3M1L1 905-945-9415
4 J & R SPORTS LTD 151 QUEEN ST N/A DUNNVILLE ON N1A1H6 905-774-8872
.. ... ... ... ... ... ... ...
95 KINGSVILLE MAIN ST 410 MAIN ST E UNIT #3/4 KINGSVILLE ON N9Y 1A7 519-733-4138
96 ST. CLAIR SHORES S/C 25 AMY CROFT DRIVE UNIT #15 WINDSOR ON N9K1C7 519-735-5364
97 TECUMSEH MALL D2-7650 TECUMSEH RD E N/A WINDSOR ON N8T1E9 519-974-1421
98 DEVONSHIRE MALL 3100 HOWARD AVE UNIT #SS5 WINDSOR ON N8X3Y8 519-969-2099
99 PLAYIT STAR 105 HENRY STREET WEST N/A PRESCOTT ON K0E1T0 613-925-0776
[100 rows x 7 columns]
To select different li's you can use the :nth-of-type(n) CSS selector.
To use a CSS selector, use the select_one() method instead of .find().
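For instance, a minimal sketch of how these selectors behave, using a made-up HTML snippet rather than the real store-finder markup:
from bs4 import BeautifulSoup

snippet = "<ul><li>4725 Dorchester Rd</li><li>Unit #B10</li><li>L2E 0A8</li></ul>"
soup = BeautifulSoup(snippet, "html.parser")

# CSS selectors work with select()/select_one(), not with find()
print(soup.select_one("li:nth-of-type(1)").get_text(strip=True))  # first li -> 4725 Dorchester Rd
print(soup.select_one("li:last-of-type").get_text(strip=True))    # last li  -> L2E 0A8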
Note:
I added the user-agent header since the page was stuck on loading.
In your example:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.thesource.ca/en-ca/store-finder"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

out = {"Address": [], "Postal": [], "Phone": []}
for tag in soup.select(".details"):
    out["Address"].append(tag.select_one("li:nth-of-type(1)").get_text(strip=True))
    out["Postal"].append(
        tag.select_one("li:last-of-type").get_text(strip=True)
    )
    out["Phone"].append(tag.select_one("a.tel-link").get_text(strip=True))

df = pd.DataFrame(out)
print(df.to_string())
Output (truncated):
Address Postal Phone
0 4725 Dorchester Rd L2E 0A8 905-356-0772
1 800 NIAGARA ST N L3C5Z4 905-735-2136
2 221 GLENDALE AVE L2T2K9 905-684-1456
3 44 Livingston Ave. L3M1L1 905-945-9415
4 151 QUEEN ST N1A1H6 905-774-8872
You are close: each row object produced by iterating over BeautifulSoup.select('tr.storeItem.store-result-row') can be further selected from to get the li values. In the solution below, a function is used to take in each row and extract the results:
import requests, pandas as pd
from bs4 import BeautifulSoup as soup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'}
d = soup(requests.get('https://www.thesource.ca/en-ca/store-finder', headers=headers).text, 'html.parser')

def store_info(row):
    return {'store': row.select_one('td.address .itemName').get_text(strip=True),
            'address': ', '.join((j := list(filter(None, [i.text for i in row.select('td.address ul li')])))[:-1]),
            'postal_code': j[-1],
            'phone': row.select_one('td.address .tel-link').get_text(strip=True)}

results = [store_info(row) for row in d.select('table:nth-of-type(1) tr.storeItem.store-result-row')]
df = pd.DataFrame(results)
Output:
store address postal_code phone
0 Optimist Square 4725 Dorchester Rd, Unit #B10, NIAGARA FALLS, ON L2E 0A8 905-356-0772
1 SEAWAY MALL 800 NIAGARA ST N, UNIT #K12, WELLAND, ON L3C5Z4 905-735-2136
2 PEN CENTRE 221 GLENDALE AVE, ST CATHARINES, ON L2T2K9 905-684-1456
3 GRIMSBY SQUARE SC 44 Livingston Ave., Unit #1006A, GRIMSBY, ON L3M1L1 905-945-9415
4 J & R SPORTS LTD 151 QUEEN ST, DUNNVILLE, ON N1A1H6 905-774-8872
.. ... ... ... ...
95 KINGSVILLE MAIN ST 410 MAIN ST E, UNIT #3/4, KINGSVILLE, ON N9Y 1A7 519-733-4138
96 ST. CLAIR SHORES S/C 25 AMY CROFT DRIVE, UNIT #15, WINDSOR, ON N9K1C7 519-735-5364
97 TECUMSEH MALL D2-7650 TECUMSEH RD E, WINDSOR, ON N8T1E9 519-974-1421
98 DEVONSHIRE MALL 3100 HOWARD AVE, UNIT #SS5, WINDSOR, ON N8X3Y8 519-969-2099
99 PLAYIT STAR 105 HENRY STREET WEST, PRESCOTT, ON K0E1T0 613-925-0776
[100 rows x 4 columns]