I'm new to web scraping and was trying to get a basic web scraping script to work. The code runs just fine; the problem is that I cannot get the CSV file to contain any information. It only shows the name of each column, with no data. Any help would be appreciated.
import requests
from bs4 import BeautifulSoup
import csv
def scrape_cars(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "lxml")
    cars = []
    for car_div in soup.find_all("div", class_="c-search-card"):
        car = {}
        car["title"] = car_div.find("h2").text.strip()
        car["price"] = car_div.find("div", class_="c-search-card__price").text.strip()
        car["location"] = car_div.find("div", class_="c-search-card__location").text.strip()
        car["year"] = car_div.find("div", class_="c-search-card__year").text.strip()
        car["km"] = car_div.find("div", class_="c-search-card__km").text.strip()
        car["color"] = car_div.find("div", class_="c-search-card__color").text.strip()
        car["carrosserie"] = car_div.find("div", class_="c-search-card__body-type").text.strip()
        car["puissance fiscale"] = car_div.find("div", class_="c-search-card__tax-horsepower").text.strip()
        car["boite"] = car_div.find("div", class_="c-search-card__transmission").text.strip()
        cars.append(car)
    return cars

url = "https://www.automobile.tn/fr/occasion"
cars = scrape_cars(url)

# write to CSV file
with open("cars.csv", "w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["title", "price", "location", "year", "km", "color", "carrosserie", "puissance fiscale", "boite"])
    writer.writeheader()
    for car in cars:
        writer.writerow(car)
This is all I get in the CSV file (just the header row).
Here is one way of getting that information you're after:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm ## if using jupyter notebook: from tqdm.notebook import tqdm
big_list = []
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)

for x in tqdm(range(1, 25)):  ## to get all cars set range to 266
    soup = bs(s.get(f'https://www.automobile.tn/fr/occasion/{x}').text, 'html.parser')
    cars = soup.select('div[class="occasion-item"]')
    for c in cars:
        title = c.select_one('h2').text.strip()
        price = c.select_one('div[class="price"]').text.strip()
        big_list.append((title, price))
        ## add other elements as needed

df = pd.DataFrame(big_list, columns=['title', 'price'])
# df.to_csv('various_cars.csv') ## uncomment to save as csv
print(df)
Result in terminal:
100% 24/24 [00:25<00:00, 1.08it/s]
title price
0 Mazda CX-5 69 700 DT
1 Mercedes-Benz Classe E 53 000 DT
2 Mercedes-Benz Classe E 252 000 DT
3 Seat Arona 71 500 DT
4 Volkswagen Golf 7 47 000 DT
... ... ...
283 BMW Série 1 74 000 DT
284 BMW Série 3 135 000 DT
285 Volkswagen Golf 7 70 000 DT
286 Mercedes-Benz Classe C coupé 159 000 DT
287 Volkswagen Jetta 36 000 DT
288 rows × 2 columns
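If you need more fields than title and price, the same pattern extends to the other elements on each card. A minimal sketch, assuming you first look up the real class names in your browser's devtools (div.year below is a placeholder, not the site's actual markup):
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

rows = []
soup = bs(requests.get('https://www.automobile.tn/fr/occasion/1',
                       headers={'User-Agent': 'Mozilla/5.0'}).text, 'html.parser')
for c in soup.select('div[class="occasion-item"]'):
    title = c.select_one('h2').text.strip()
    price = c.select_one('div[class="price"]').text.strip()
    year_tag = c.select_one('div.year')  # hypothetical selector -- adapt it from devtools
    rows.append((title, price, year_tag.text.strip() if year_tag else None))

print(pd.DataFrame(rows, columns=['title', 'price', 'year']))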
I need your help automating this web page: I am trying to get the data for all the players across the different pages.
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.mlb.com/es/stats/spring-training'
pagina = requests.get(url)
soup = BeautifulSoup(pagina.text, 'lxml')
table = soup.find('table', {'class': "bui-table is-desktop-sKqjv9Sb"})

encabezados = []
for i in table.find_all('th')[:18]:
    datos = i.find_all('button')
    for td in datos:
        titulo = td.text.strip()
        encabezados.append(titulo)

datos_mlb = pd.DataFrame(columns=encabezados)

nombres = []
for i in table.find_all('th')[18:]:
    datos = i.find_all('a')
    for td in datos:
        jugadores = td.text.strip()
        nombres.append(jugadores)
datos_mlb['JUGADOR'] = nombres

for fila in table.find_all('tr')[1:]:
    data = fila.find_all('td')
    data_fila = [td.text.strip() for td in data]
    largo = len(datos_mlb) - 1
    datos_mlb.iloc[:, 1:] = data_fila
I have tried to fill in the vast majority of the information, but I cannot populate the data correctly or iterate over all the pages.
Try to use the structured data from the JSON response of the XHR request to create your dataframe. Inspect the network tab in your browser's devtools to get an idea of what parameters you should send and what you will get back:
import pandas as pd
import requests

data = []
for i in range(0, 175, 25):
    data.extend(
        requests.get(
            f'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=25&offset={i}&sortStat=onBasePlusSlugging&order=desc',
            headers={'user-agent': 'Mozilla/5.0'}
        ).json()['stats']
    )
pd.DataFrame(data)
Output:
     playerId        playerName  ...    type  atBatsPerHomeRun
0      502671  Paul Goldschmidt  ...  player               5.5
1      621439      Byron Buxton  ...  player               6.4
2      547180      Bryce Harper  ...  player              4.38
3      658668   Edward Olivares  ...  player             11.33
4      670351        Jose Rojas  ...  player                 9
..        ...               ...  ...     ...               ...
156    593871     Jorge Polanco  ...  player             32.00
157    676475     Alec Burleson  ...  player              -.--
158    608385      Jesse Winker  ...  player              -.--
159    641355    Cody Bellinger  ...  player              -.--
160    660162      Yoan Moncada  ...  player              -.--

[161 rows x 72 columns]
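If you would rather not hard-code the upper bound of range(0, 175, 25), you can keep raising the offset until the API stops returning records. A minimal sketch, assuming an offset past the last player yields an empty stats list:
import requests

records, offset = [], 0
while True:
    batch = requests.get(
        'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player',
        params={'stitch_env': 'prod', 'season': 2022, 'sportId': 1, 'stats': 'season',
                'group': 'hitting', 'gameType': 'S', 'limit': 25, 'offset': offset,
                'sortStat': 'onBasePlusSlugging', 'order': 'desc'},
        headers={'user-agent': 'Mozilla/5.0'},
    ).json().get('stats', [])
    if not batch:  # assumption: past the last page the list comes back empty
        break
    records.extend(batch)
    offset += 25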
You are not getting all the required data because the data is loaded dynamically via an API, so you have to pull the data from that API.
Example:
import pandas as pd
import requests

api_url = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=161&offset=0&sortStat=onBasePlusSlugging&order=desc'
req = requests.get(api_url).json()

data = []
for item in req['stats']:
    playerName = item['playerName']
    data.append({
        'playerName': playerName
    })

df = pd.DataFrame(data)
print(df)
Output:
playerName
0 Paul Goldschmidt
1 Byron Buxton
2 Bryce Harper
3 Edward Olivares
4 Jose Rojas
.. ...
156 Jorge Polanco
157 Alec Burleson
158 Jesse Winker
159 Cody Bellinger
160 Yoan Moncada
[161 rows x 1 columns]
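The same loop can copy any other field present in each stats record; for example, a sketch using keys that appear in the previous answer's output (playerId, atBatsPerHomeRun):
import pandas as pd
import requests

req = requests.get(api_url).json()  # api_url as defined above
rows = []
for item in req['stats']:
    rows.append({
        'playerId': item.get('playerId'),
        'playerName': item.get('playerName'),
        'atBatsPerHomeRun': item.get('atBatsPerHomeRun'),  # shown as "-.--" for players without a home run
    })
print(pd.DataFrame(rows))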
So I have been trying to write a data scraper for an online shop selling cables and other supplies.
I wrote simple code that should work. The shop's products are divided into categories, and I started with the first category, cables.
for i in range(0, 27):
    url = "https://onninen.pl/produkty/Kable-i-przewody?query=/strona:{0}"
    url = url.format(i)
It works fine for the first two pages, with i equal to 0 and 1 (I get response code 200), but no matter when I try, pages 2 and up return error 500, and I have no idea why, especially since they open normally from the same link in a browser.
I even tried randomizing the time between requests :(
Any idea what the problem might be? Should I try a different web scraping library?
Below is the full code:
import requests
from fake_useragent import UserAgent
import pandas as pd
from bs4 import BeautifulSoup
import time
import random

products = []  # List to store name of the product
MIN = []       # Manufacturer item number
prices = []    # List to store price of the product
df = pd.DataFrame()
user_agent = UserAgent()

i = 0
for i in range(0, 27):
    url = "https://onninen.pl/produkty/Kable-i-przewody?query=/strona:{0}"
    url = url.format(i)
    #print(url)

    # getting the response from the page using get method of requests module
    page = requests.get(url, headers={"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36"})
    #print(page.status_code)

    # storing the content of the page in a variable
    html = page.content

    # creating BeautifulSoup object
    page_soup = BeautifulSoup(html, "html.parser")
    #print(page_soup.prettify())

    for containers in page_soup.findAll('div', {'class': 'styles__ProductsListItem-vrexg1-2 gkrzX'}):
        name = containers.find('label', attrs={'class': 'styles__Label-sc-1x6v2mz-2 gmFpMA label'})
        price = containers.find('span', attrs={'class': 'styles__PriceValue-sc-33rfvt-10 fVFAzY'})
        man_it_num = containers.find('div', attrs={'title': 'Indeks producenta'})
        formatted_name = name.text.replace('Dodaj do koszyka: ', '')
        products.append(formatted_name)
        prices.append(price.text)
        MIN.append(man_it_num.text)

    df = pd.DataFrame({'Product Name': products, 'Price': prices, 'MIN': MIN})
    time.sleep(random.randint(2, 11))

#df.to_excel('output.xlsx', sheet_name='Kable i przewody')
That is because the pages are loaded dynamically via an API, so to get all the data you have to use that API.
Example:
import pandas as pd
import requests

api_url = 'https://onninen.pl/api/search?query=/Kable-i-przewody/strona:{p}'
headers = {
    'user-agent': 'Mozilla/5.0',
    'referer': 'https://onninen.pl/produkty/Kable-i-przewody?query=/strona:2',
    'cookie': '_gid=GA1.2.1022119173.1663690794; _fuid=60a315c76d054fd5add850c7533f529e; _gcl_au=1.1.1522602410.1663690804; pollsvisible=[]; smuuid=1835bb31183-22686567c511-4116ddce-c55aa071-2639dbd6-ec19e64a550c; _smvs=DIRECT; poll_random_44=1; poll_visited_pages=2; _ga=GA1.2.1956280663.1663690794; smvr=eyJ2aXNpdHMiOjEsInZpZXdzIjo3LCJ0cyI6MTY2MzY5MjU2NTI0NiwibnVtYmVyT2ZSZWplY3Rpb25CdXR0b25DbGljayI6MCwiaXNOZXdTZXNzaW9uIjpmYWxzZX0=; _ga_JXR5QZ2XSJ=GS1.1.1663690794.1.1.1663692567.0.0.0'
}

dfs = []
for p in range(1, 28):
    d = requests.get(api_url.format(p=p), headers=headers).json()['items'][0]['items']
    df = pd.DataFrame(d)
    dfs.append(df)

df = pd.concat(dfs)
print(df)
Output:
id slug index catalogindex ... onntopcb isnew qc ads
0 147774 KABLE-ROZNE-MARKI-Kabel-energetyczny-YKY-ZO-3x... HES890 112271067D0500 ... 0 False None None
1 45315 KABLE-ROZNE-MARKI-Kabel-energetyczny-YKY-ZO-3x... HES893 112271068D0500 ... 0 False None None
2 169497 KABLE-ROZNE-MARKI-Kabel-energetyczny-YKY-ZO-3x... HES896 112271069D0500 ... 0 False None None
3 141820 KABLE-ROZNE-MARKI-Kabel-energetyczny-YKY-ZO-4x... HES900 112271056D0500 ... 0 False None None
4 47909 KABLE-ROZNE-MARKI-Kabel-energetyczny-YKY-ZO-4x... HES903 112271064D0500 ... 0 False None None
.. ... ... ... ... ... ... ... ... ...
37 111419 NVENT-RAYCHEM-Kabel-grzejny-EM2-XR-samoreguluj... HDZ938 449561-000 ... 0 True None None
38 176526 NVENT-RAYCHEM-Przewod-stalooporowy-GM-2CW-35m-... HEA099 SZ18300102 ... 0 False None None
39 38484 DEVI-Mata-grzewcza-DEVIheat-150S-150W-m2-375W-... HAJ162 140F0332 ... 1 False None None
40 60982 DEVI-Mata-grzewcza-DEVImat-150T-150W-m2-375W-0... HAJ157 140F0448 ... 1 False None None
41 145612 DEVI-Czujnik-Devireg-850-rynnowy-czujnik-140F1... HAJ212 140F1086 ... 0 False None None
[1292 rows x 27 columns]
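The API returns far more columns than the scraper originally needed (product name, price, manufacturer index). Below is a sketch of trimming the frame down and writing the Excel file; 'slug' and 'catalogindex' are simply columns visible in the truncated output above, so check df.columns for the actual name and price fields before relying on this:
# Inspect what the API actually returns, then keep only the relevant columns.
print(df.columns.tolist())

# Hypothetical selection -- swap in the real name/price/manufacturer-index columns.
subset = df[['slug', 'catalogindex']]
subset.to_excel('output.xlsx', sheet_name='Kable i przewody', index=False)  # requires openpyxl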
I am trying to scrape a Realtor webpage, and I am successful in doing so using Requests and BS4, but the main problem is that it sometimes returns 1 element and sometimes 2, depending on whether the item is present in the listing or not. Both of these items have the same div tag and class name, so I can't differentiate them.
My code is below:
import requests
from bs4 import BeautifulSoup
import pandas as pd

html = requests.get('https://www.realtor.com/realestateagents/84664/pg-1')
doc = BeautifulSoup(html.text, 'html.parser')

names = []
contacts = []
for_sale = []
sold = []
price_range = []
last_listing_date = []

for box in doc.find_all('div', class_='jsx-3970352998 agent-list-card clearfix'):
    names.append(box.find('div', class_='jsx-3970352998 agent-name text-bold').text)
    try:
        contacts.append(box.find('div', class_='jsx-3970352998 agent-phone hidden-xs hidden-xxs'))
    except IndexError:
        contacts.append('No contact number found')

    property_data = box.find_all('div', class_='jsx-3970352998 agent-detail-item ellipsis')
    try:
        for_sale.append(property_data[0].span.text)
    except:
        for_sale.append('None')
    try:
        sold.append(property_data[1].span.text)
    except:
        sold.append('0')

    price_activity = box.find_all('div', class_='jsx-3970352998 second-column col-lg-6 no-padding')
    a = price_activity[0].find_all('div', class_='jsx-3970352998 agent-detail-item')
    print(len(a))
    try:
        price_range.append(a[0].span.text)
        print(a[0].span.text)
    except IndexError:
        print('No activity range found')
        price_range.append('No activity range found')
    try:
        print(a[1].span.text)
        last_listing_date.append(a[1].span.text)
    except IndexError:
        print('No listing data found')
        last_listing_date.append('No listing data found')

df = pd.DataFrame(data={'Name': names, 'Contact': contacts, 'Active Listings': for_sale, 'Properties Sold': sold,
                        'Price Range': price_range, 'Last Listing Date': last_listing_date})
df
And this is my output. You can see I have highlighted in yellow the values which end up in the wrong column: some listings don't have an Activity Range, so they only return one thing, the Last Listing Date, and my current code cannot handle that. I am not sure how to tackle this problem. In the desired output, those values should be in the places I marked with red dots.
It seems that the element locator strategy was not quite right.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.realtor.com/realestateagents/84664/pg-{page}'
data = []
for page in range(1, 6):
    req = requests.get(url.format(page=page))
    soup = BeautifulSoup(req.text, 'html.parser')
    for card in soup.select('div.cardWrapper > ul > div'):
        names = card.select_one('div[class="jsx-3970352998 agent-name text-bold"]').text
        contacts = card.select_one('div[class="jsx-3970352998 agent-group text-semibold ellipsis"]').get_text(strip=True)
        for_sale = card.select_one('div[class="jsx-3970352998 agent-detail-item ellipsis"]:nth-child(1) > span').text
        sold = card.select_one('div[class="jsx-3970352998 agent-detail-item ellipsis"]:nth-child(1) > span').text
        price = card.select_one('div:-soup-contains("Activity range") > span')
        price_range = price.text if price else None
        date = card.select_one('div:-soup-contains("Listed a house") > span')
        last_listing_date = date.text if date else None

        data.append({
            'names': names,
            'contacts': contacts,
            'for_sale': for_sale,
            'sold': sold,
            'price_range': price_range,
            'last_listing_date': last_listing_date
        })

df = pd.DataFrame(data)
print(df)
Output:
names contacts ... price_range last_listing_date
0 Clint Allred Kw South Valley Keller Williams ... $370K - $1.08M 2022-08-18
1 Martha McMullin The Group Real Estate, LLC ... $495K - $995K 2022-08-18
2 Aren Bybee R and R Realty, LLC ... $115K - $2.49M 2022-08-18
3 Kenny ParcellTeam Equity Real Estate - Utah ... $125K - $1.2M 2022-08-17
4 Eric MossTeam Equity Real Estate - Utah ... $125K - $600K 2022-08-17
.. ... ... ... ... ...
95 Marny Schlopy Coldwell Banker Realty ... $410K - $756K None
96 Amy Laster-Haynes Better Homes and Gardens Real Estate Momentum ... $364K - $2.62M None
97 Raquel Jex Presidio Real Estate Company ... $442K - $442K None
98 Kelly Ercanbrack Unite Real Estate ... $400K - $400K None
99 Camie Jefferies Equity Real Estate - Tooele ... None Reported None
[100 rows x 6 columns]
You should be able to get the data you're looking for like this:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
s = requests.Session()
s.headers.update(headers)

big_list = []
for x in tqdm(range(1, 12)):
    r = s.get(f'https://www.realtor.com/realestateagents/84664/pg-{x}', headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    agent_cards = soup.select('div[data-testid="component-agentCard"]')
    for a in agent_cards:
        name = a.select_one('div.agent-name').get_text(strip=True)
        company = a.select_one('div.agent-group').get_text(strip=True)
        try:
            phone = a.select_one('div.agent-phone').get_text(strip=True)
        except Exception as e:
            phone = 'Phoneless'
        try:
            experience = a.select_one('div#agentExperience').get_text(strip=True)
        except Exception as e:
            experience = 'Quite inexperienced'
        try:
            h_for_sale = a.select_one('span.sale-sold-count').get_text(strip=True)
        except Exception as e:
            h_for_sale = 0
        big_list.append((name, company, phone, experience, h_for_sale))

df = pd.DataFrame(big_list, columns=['Name', 'Company', 'Phone', 'Experience', 'For sale'])
print(df)
Result:
                Name                       Company           Phone                    Experience  For sale
0    Martha McMullin    The Group Real Estate, LLC  (303) 638-1033            Experience:8 years         2
1         Aren Bybee           R and R Realty, LLC  (801) 210-1461  Experience:22 years 2 months        31
2  Kenny ParcellTeam     Equity Real Estate - Utah  (801) 794-7777  Experience:26 years 7 months        24
3      Eric MossTeam     Equity Real Estate - Utah  (801) 669-0383  Experience:10 years 5 months        10
4     Chantelle Rees  Equity Real Estate - Results  (801) 636-2515           Quite inexperienced         4
[...]
Using the logic above, you can obtain other info as well and include it in the dataframe. BeautifulSoup docs: https://beautiful-soup-4.readthedocs.io/en/latest/index.html
Also, TQDM: https://pypi.org/project/tqdm/
I am currently trying to scrape store locations for a research project that aims to show the effect COVID had on different retailers. The retailer I am having an issue with at the moment is "The Source". It's a Canadian retailer with a large number of stores across Canada, which are generally small compared to Best Buy. The store locator page is: https://www.thesource.ca/en-ca/store-finder
The goal for this code is to produce an Excel file with columns for address, postal code and phone number (I assume I'll just use pandas for this); those three are also the data I want to scrape. I think the code I've written so far is on the right track, since most of the information sits inside a table, but I am struggling to get to the 'li' tags and to loop through the different rows of the table. If anyone has an idea of how I would grab the 'li' tags for each piece of data I want, that would be great!
import requests
from bs4 import BeautifulSoup

url = 'https://www.thesource.ca/en-ca/store-finder'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

Locations_table = soup.find('table', class_='storeResultList store-result-list desktop-only')
for locations in Locations_table.find_all('tbody'):
    rows = locations.find_all('tr', class_='storeItem store-result-row')
    for row in rows:
        address = row.find('td', class_='address')
        # trying to get address
        # postal
        # phone number which I think is not under this table

print(Locations_table)
We code according to the page's logic: once you spot a pattern, you can parse toward it.
The pattern here is that most addresses split into 6 parts, while the incomplete ones split into 5, so we can normalize them.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from more_itertools import collapse

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}


def main(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    goal = [list(x.stripped_strings)
            for x in soup.select_one('.storeResultList').select('.address')[1:]]
    allin = []
    for x in goal:
        if len(x) == 5:
            x.insert(2, 'N/A')
        x[3] = x[3].rsplit(",", 1)
        allin.append(list(collapse(x)))
    df = pd.DataFrame(
        allin, columns=["Name", "Address", "Unit", "City", "State", "Zip", "Phone"])
    df.to_csv('data.csv', index=False)


main('https://www.thesource.ca/en-ca/store-finder')
Output:
Name Address Unit City State Zip Phone
0 Optimist Square 4725 Dorchester Rd Unit #B10 NIAGARA FALLS ON L2E 0A8 905-356-0772
1 SEAWAY MALL 800 NIAGARA ST N UNIT #K12 WELLAND ON L3C5Z4 905-735-2136
2 PEN CENTRE 221 GLENDALE AVE N/A ST CATHARINES ON L2T2K9 905-684-1456
3 GRIMSBY SQUARE SC 44 Livingston Ave. Unit #1006A GRIMSBY ON L3M1L1 905-945-9415
4 J & R SPORTS LTD 151 QUEEN ST N/A DUNNVILLE ON N1A1H6 905-774-8872
.. ... ... ... ... ... ... ...
95 KINGSVILLE MAIN ST 410 MAIN ST E UNIT #3/4 KINGSVILLE ON N9Y 1A7 519-733-4138
96 ST. CLAIR SHORES S/C 25 AMY CROFT DRIVE UNIT #15 WINDSOR ON N9K1C7 519-735-5364
97 TECUMSEH MALL D2-7650 TECUMSEH RD E N/A WINDSOR ON N8T1E9 519-974-1421
98 DEVONSHIRE MALL 3100 HOWARD AVE UNIT #SS5 WINDSOR ON N8X3Y8 519-969-2099
99 PLAYIT STAR 105 HENRY STREET WEST N/A PRESCOTT ON K0E1T0 613-925-0776
[100 rows x 7 columns]
To select different li's you can use the :nth-of-type(n) CSS selector.
To use a CSS selector, use the select_one() method instead of .find().
Note:
I added the user-agent header since the page was stuck on loading.
In your example:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.thesource.ca/en-ca/store-finder"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")

out = {"Address": [], "Postal": [], "Phone": []}
for tag in soup.select(".details"):
    out["Address"].append(tag.select_one("li:nth-of-type(1)").get_text(strip=True))
    out["Postal"].append(
        tag.select_one("li:last-of-type").get_text(strip=True)
    )
    out["Phone"].append(tag.select_one("a.tel-link").get_text(strip=True))

df = pd.DataFrame(out)
print(df.to_string())
Output (truncated):
Address Postal Phone
0 4725 Dorchester Rd L2E 0A8 905-356-0772
1 800 NIAGARA ST N L3C5Z4 905-735-2136
2 221 GLENDALE AVE L2T2K9 905-684-1456
3 44 Livingston Ave. L3M1L1 905-945-9415
4 151 QUEEN ST N1A1H6 905-774-8872
You are close: each row object produced by iterating over BeautifulSoup.select('tr.storeItem.store-result-row') can be further selected from to get the li values. In the solution below, a function is used to take in each row and extract the results:
import requests, pandas as pd
from bs4 import BeautifulSoup as soup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'}
d = soup(requests.get('https://www.thesource.ca/en-ca/store-finder', headers=headers).text, 'html.parser')


def store_info(row):
    return {'store': row.select_one('td.address .itemName').get_text(strip=True),
            'address': ', '.join((j := list(filter(None, [i.text for i in row.select('td.address ul li')])))[:-1]),
            'postal_code': j[-1],
            'phone': row.select_one('td.address .tel-link').get_text(strip=True)}


results = [store_info(row) for row in d.select('table:nth-of-type(1) tr.storeItem.store-result-row')]
df = pd.DataFrame(results)
Output:
store address postal_code phone
0 Optimist Square 4725 Dorchester Rd, Unit #B10, NIAGARA FALLS, ON L2E 0A8 905-356-0772
1 SEAWAY MALL 800 NIAGARA ST N, UNIT #K12, WELLAND, ON L3C5Z4 905-735-2136
2 PEN CENTRE 221 GLENDALE AVE, ST CATHARINES, ON L2T2K9 905-684-1456
3 GRIMSBY SQUARE SC 44 Livingston Ave., Unit #1006A, GRIMSBY, ON L3M1L1 905-945-9415
4 J & R SPORTS LTD 151 QUEEN ST, DUNNVILLE, ON N1A1H6 905-774-8872
.. ... ... ... ...
95 KINGSVILLE MAIN ST 410 MAIN ST E, UNIT #3/4, KINGSVILLE, ON N9Y 1A7 519-733-4138
96 ST. CLAIR SHORES S/C 25 AMY CROFT DRIVE, UNIT #15, WINDSOR, ON N9K1C7 519-735-5364
97 TECUMSEH MALL D2-7650 TECUMSEH RD E, WINDSOR, ON N8T1E9 519-974-1421
98 DEVONSHIRE MALL 3100 HOWARD AVE, UNIT #SS5, WINDSOR, ON N8X3Y8 519-969-2099
99 PLAYIT STAR 105 HENRY STREET WEST, PRESCOTT, ON K0E1T0 613-925-0776
[100 rows x 4 columns]
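Since the stated goal was an Excel file rather than a printed frame, any of the dataframes above can be written straight to a workbook (pandas needs the openpyxl package for .xlsx output; the file name here is just an example):
# write the scraped store data to an Excel workbook
df.to_excel('the_source_stores.xlsx', sheet_name='stores', index=False)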
I am currently creating different projects to grasp the concept of web scraping. Right now I am trying to create a database of items from a shoe-selling site, but I can't seem to get the data in text form.
Here is what I have tried:
from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import numpy
import statistics
import pandas as pd

offset = 0
driver = webdriver.Chrome()
listo = []
while True:
    driver.get("https://stockx.com/sneakers?page={offset}".format(offset=offset))
    time.sleep(10)
    main_div = driver.find_elements_by_xpath('//*[@id="main-content"]/div[2]/div[2]/div/div')
    for div in main_div:
        links = div.find_elements_by_tag_name("a")
        for link in links:
            namer = (link.get_attribute('href'))
            print(namer)
            offset += 0.05
            listo.append(namer)
    namelist = sorted(set(listo))
    for hreflink in namelist:
        hreflinks = (hreflink)
        driver.get(hreflinks)
        time.sleep(10)
        LastsaleD = driver.find_elements_by_xpath('//*[@id="marketsummary"]/div[2]/div/div[1]/div[1]')
        print(LastsaleD).text
    if offset > 30:
        break
Using Selenium is overkill and less efficient here. The data is found in JSON format within the <script> tags of the source HTML. Just do a simple request of the site, pull out the relevant <script> with the JSON, then parse the JSON into rows to put into a table.
Also, why increment offset += 0.05? I understand your logic of adding it for every 20 items on the page, but why not just increment by 1 after the loop through those 20 items? What happens if for whatever reason you get 19 items returned, or 21? Then your increments will be off for the rest of the loop.
Anyways, here's the code. This will get you going.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}

at_end = False
offset = 0
rows = []
while at_end == False:
    offset += 1
    url = "https://stockx.com/sneakers?page={offset}".format(offset=offset)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    scripts = soup.find_all('script', {'type': 'application/ld+json'})

    for script in scripts:
        jsonMatch = re.compile("{.*}")
        jsonStr = jsonMatch.search(str(script))[0]
        jsonData = json.loads(jsonStr)
        if jsonData['@type'] == 'OfferCatalog':
            break

    listings = jsonData['itemListElement']
    for listing in listings:
        item = listing['item']
        offers = item.pop('offers')
        item.update(offers)
        if item not in rows:
            rows.append(item)
        else:
            at_end = True
            continue
    print('Page: %s' % offset)

df = pd.DataFrame(rows)
Output:
print(df)
              @type        brand  ...  highPrice priceCurrency
0 AggregateOffer Jordan ... 165 GBP
1 AggregateOffer Jordan ... 226 GBP
2 AggregateOffer Jordan ... 321 GBP
3 AggregateOffer Jordan ... 159 GBP
4 AggregateOffer Jordan ... 190 GBP
.. ... ... ... ... ...
495 AggregateOffer Nike ... 230 GBP
496 AggregateOffer New Balance ... 159 GBP
497 AggregateOffer Nike ... 152 GBP
498 AggregateOffer Nike ... 162 GBP
499 AggregateOffer Nike ... 167 GBP
[500 rows x 14 columns]
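From here the frame can be trimmed to the columns you care about and saved; a small usage sketch using column names visible in the output above:
# keep a few of the 14 columns and write them out
subset = df[['brand', 'highPrice', 'priceCurrency']]
subset.to_csv('stockx_sneakers.csv', index=False)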