I want to scrape these product information - python

import requests
from bs4 import BeautifulSoup
import pandas as pd
baseurl='https://twillmkt.com'
headers ={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
r =requests.get('https://twillmkt.com/collections/denim')
soup=BeautifulSoup(r.content, 'html.parser')
tra = soup.find_all('div',class_='ProductItem__Wrapper')
productlinks=[]
Title=[]
Brand=[]
Colour=[]
for links in tra:
for link in links.find_all('a',href=True)[1:]:
comp=baseurl+link['href']
productlinks.append(comp)
for link in productlinks:
r =requests.get(link,headers=headers)
soup=BeautifulSoup(r.content, 'html.parser')
title=soup.find('h1').text
Title.append(title)
price=soup.find('span',class_="money").text
Brand.append(price)
Product_Features=soup.find_all('li').text
Colour.append(Product_Features)
df = pd.DataFrame(
{"Title": Title, "Price": Brand,"Product_Features":Colour}
)
print(df)
I scrape the title and price but difficult to scrape these information SIZE,product feature,material,model size,image
Single Page Link
https://twillmkt.com/products/light-blue-butterfly-print-slim-leg-denim?variant=39498848403534

i'll give you the css selectors for the elements you want. you can catch them with soup.select()
size: select[id^="product-select"] option
product feature: div.ProductMeta__Description.Rte li
material: div.ProductMeta__Description.Rte p:nth-of-type(2)
model size: div.ProductMeta__Description.Rte p:nth-of-type(3)
image: div[id^="Image"] span img

Related

Selecting links within a div tag using beautiful soup

I am trying to run the following code
headers = {
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
params = {
'q': 'Machine learning,
'hl': 'en'
}
html = requests.get('https://scholar.google.com/scholar', headers=headers,
params=params).text
soup = BeautifulSoup(html, 'lxml')
for result in soup.select('.gs_r.gs_or.gs_scl'):
profiles=result.select('.gs_a a')['href']
The following output (error) is being shown
"TypeError: list indices must be integers or slices, not str"
What is it I am doing wrong?
The following is tested and works:
import requests
from bs4 import BeautifulSoup as bs
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
params = {
'q': 'Machine learning',
'hl': 'en'
}
html = requests.get('https://scholar.google.com/scholar', headers=headers,
params=params).text
soup = bs(html, 'lxml')
for result in soup.select('.gs_r.gs_or.gs_scl'):
profiles=result.select('.gs_a a')
for p in profiles:
print(p.get('href'))
Result in terminal:
/citations?user=rSVIHasAAAAJ&hl=en&oi=sra
/citations?user=MnfzuPYAAAAJ&hl=en&oi=sra
/citations?user=09kJn28AAAAJ&hl=en&oi=sra
/citations?user=yxUduqMAAAAJ&hl=en&oi=sra
/citations?user=MnfzuPYAAAAJ&hl=en&oi=sra
/citations?user=9Vdfc2sAAAAJ&hl=en&oi=sra
/citations?user=lXYKgiYAAAAJ&hl=en&oi=sra
/citations?user=xzss3t0AAAAJ&hl=en&oi=sra
/citations?user=BFdcm_gAAAAJ&hl=en&oi=sra
/citations?user=okf5bmQAAAAJ&hl=en&oi=sra
/citations?user=09kJn28AAAAJ&hl=en&oi=sra
In your code, you were trying to obtain the href attribute from a list (soup.select returns a list, and soup.select_one return a single element).
See BeautifulSoup documentation here

Create Rows and Columns in BeautifulSoup

Below is code python code output.I want output in rows and column in dataframe:
response = requests.get(source_data)
soup = BeautifulSoup(response.text, "html.parser")
States = soup.find_all('div',class_ = 'card bg-darker p-3 mb-3')
for item in States :
state_name = item.find(class_='fw-bold fs-5 mb-2').text
vaccinated_per = item.find(class_='col-3 text-end fs-5 ff-s text-success').text
print(state_name,vaccinated_per)
Output:
Flanders 80.24%
Wallonia 70.00%
Brussels 56.73%
Ostbelgien 65.11%
Collect your information in a list of dicts and then simply create a data frame from it:
data = []
for item in States :
data.append({
'state_name' : item.find(class_='fw-bold fs-5 mb-2').text,
'vaccinated_per' : item.find(class_='col-3 text-end fs-5 ff-s text-success').text
})
pd.DataFrame(data)
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
response = requests.get('https://covid-vaccinatie.be/en', headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
States = soup.find_all('div',class_ = 'card bg-darker p-3 mb-3')
data = []
for item in States :
data.append({
'state_name' : item.find(class_='fw-bold fs-5 mb-2').text,
'vaccinated_per' : item.find(class_='col-3 text-end fs-5 ff-s text-success').text
})
pd.DataFrame(data)
Output
state_name vaccinated_per
0 Flanders 80.24%
1 Wallonia 70.00%
2 Brussels 56.73%
3 Ostbelgien 65.11%

Web scraper returning zero in terminal

I'm trying to scrape this website
https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who=bygg&bf=1&page=1
And I've put a def getQuestions(tag) in the who={tag} part of the url and that works fine. When I try to add def getQuestions(tag, page) page={page} it just returns 0 in the terminal, and I really hope no clue what could be causing this.
Here is the full code:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}
questionlist = []
def getQuestions(tag, page):
url = 'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={bygg}&bf=1&page={page}'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
questions = soup.find_all('div', {'class': 'box-white p-0 mb-4'})
for item in questions:
question = {
'title': item.find('a', {'class': 'link-primary'}).text,
'link': item.find('a', {'class': 'link-primary'})['href'],
'nummer': item.find('a', {'class': 'link-body'})['href'],
'address': item.find('address', {'class': 'mt-2 mb-0'}).text,
'RegÅr': item.find('div', {'class': 'col text-center'}).text,
}
questionlist.append(question)
return
for x in range(1,5):
getQuestions('bygg', x)
print(len(questionlist))
Any help would be appreciated. Best regards!
Change the string in url variable to f-string:
import requests
from bs4 import BeautifulSoup
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
}
def getQuestions(tag, page):
questionlist = []
url = f"https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}"
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
questions = soup.find_all("div", {"class": "box-white p-0 mb-4"})
for item in questions:
question = {
"title": item.find("a", {"class": "link-primary"}).text,
"link": item.find("a", {"class": "link-primary"})["href"],
"nummer": item.find("a", {"class": "link-body"})["href"],
"address": item.find("address", {"class": "mt-2 mb-0"}).text,
"RegÅr": item.find("div", {"class": "col text-center"}).text,
}
questionlist.append(question)
return questionlist
out = []
for x in range(1, 5):
out.extend(getQuestions("bygg", x))
print(len(out))
Prints:
80
Try changing your url to this:
url = f'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}'
You didn't quite have your f-Strings set up right

Pick only one number from an html page with beatifulsoup

I have this url from coronavirus worldwide and I would like to pick only one number, the newcases in Arizona which is +2383 right now.
import requests
from bs4 import BeautifulSoup
import lxml
url = "https://www.worldmeter.com/coronavirus/us/"
page = requests.get("https://www.worldmeter.com/coronavirus/us/")
soup = BeautifulSoup(page.content, "lxml")
page.close()
newcases = soup.find('a', href_="https://worldmeter.com/coronavirus/arizona", class_="tableRowLinkYellow newCasesStates").get_text(strip=True)
print(newcases)
I get this error:
AttributeError: 'NoneType' object has no attribute 'get_text'
How do I pick only that number from the whole table? Thank you for your time.
Just like Linh said, it was generated by Javascript.Using selenium is an easy way but not efficient enough.(too slow)
You could scrape the API directly:
import requests
url = "https://worldmeter.com/coronavirus/wp-admin/admin-ajax.php?action=wp_ajax_ninja_tables_public_action&table_id=2582&target_action=get-all-data&default_sorting=old_first"
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
}
results = requests.get(url, headers=headers).json()
for result in results:
if result["state_name"] == "Arizona":
print(result)
print("The newcases is", result["new_cases"])
And this gave me:
{'state_name': 'Arizona', 'positive': '275,436', 'new_cases': '2,383', 'death_in_states': '6,302', 'new_deaths': '2', 'recovered_states': '45,400', 'new_recovered': '364', 'totaltestresults': 'Arizona', 'postname': 'arizona', 'cases_100_k_population': '3,866.37', 'state_population': '7278717', 'death_100_k_population': '88.46'}
The newcases is 2,383

Scraping with requests

what is wrong in my code, I try get the same content like in https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG but result is diffrent as I want to have.
import requests
from bs4 import BeautifulSoup
s = requests.Session()
s.headers.update({"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'})
response=s.get('https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-
2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG')
soup=BeautifulSoup(response.text,'lxml')
print(soup.prettify())
You can use requests and pass params in to get json for the train info and prices. I haven't parsed out all the info as this is just to show you it is possible. I parse out the train ids to be able to make the subsequent requests from price info which are linked by ids to the train info
import requests
from bs4 import BeautifulSoup as bs
url = 'https://koleo.pl/pl/connections/?'
headers = {
'Accept' : 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding' : 'gzip, deflate, br',
'Accept-Language' : 'en-US,en;q=0.9',
'Connection' : 'keep-alive',
'Cookie' : '_ga=GA1.2.2048035736.1553000429; _gid=GA1.2.600745193.1553000429; _gat=1; _koleo_session=bkN4dWRrZGx0UnkyZ3hjMWpFNGhiS1I3TzhQMGNyWitvZlZ0QVRUVVVtWUFPMUwxL0hJYWJyYnlGTUdHYXNuL1N6QlhHMHlRZFM3eFZFcjRuK3ZubllmMjdSaU5CMWRBSTFOc1JRc2lDUGV0Y2NtTjRzbzZEd0laZWI1bjJoK1UrYnc5NWNzZzNJdXVtUlpnVE15QnRnPT0tLTc1YzV1Q2xoRHF4VFpWWTdWZDJXUnc9PQ%3D%3D--3b5fe9bb7b0ce5960bc5bd6a00bf405df87f8bd4',
'Host' : 'koleo.pl',
'Referer' : 'https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG',
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
'X-CSRF-Token' : 'heag3Y5/fh0hyOfgdmSGJBmdJR3Perle2vJI0VjB81KClATLsJxFAO4SO9bY6Ag8h6IkpFieW1mtZbD4mga7ZQ==',
'X-Requested-With' : 'XMLHttpRequest'
}
params = {
'v' : 'a0dec240d8d016fbfca9b552898aba9c38fc19d5',
'query[date]' : '19-03-2019 10:00:00',
'query[start_station]' : 'krakow-glowny',
'query[end_station]': 'radom',
'query[brand_ids][]' : '29',
'query[brand_ids][]' : '28',
'query[only_direct]' : 'false',
'query[only_purchasable]': 'false'
}
with requests.Session() as s:
data= s.get(url, params = params, headers = headers).json()
print(data)
priceUrl = 'https://koleo.pl/pl/prices/{}?v=a0dec240d8d016fbfca9b552898aba9c38fc19d5'
for item in data['connections']:
r = s.get(priceUrl.format(item['id'])).json()
print(r)
You have to use selenium in order to get that dynamically generated content. And then you can parse html with BS. For example I've parsed dates:
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Firefox()
driver.get('https://koleo.pl/rozklad-pkp/krakow-glowny/radom/19-03-2019_10:00/all/EIP-IC--EIC-EIP-IC-KM-REG')
soup = BeautifulSoup(driver.page_source, 'lxml')
for div in soup.findAll("div", {"class": 'date custom-panel'}):
date = div.findAll("div", {"class": 'row'})[0].string.strip()
print(date)
Output:
wtorek, 19 marca
środa, 20 marca

Categories

Resources