How to extract data in columns from a page using BeautifulSoup - Python

I am trying to capture data that appears as bullet points on this page:
https://www.redbook.com.au/cars/details/2019-honda-civic-50-years-edition-auto-my19/SPOT-ITM-524208/
I need to extract the following data, ideally using XPath:
4 Door Sedan
4 Cylinder, 1.8 Litre
Constantly Variable Transmission, Front Wheel Drive
Petrol - Unleaded ULP
6.4 L/100km
I tried this:
import requests
import pandas as pd
from lxml import html
from bs4 import BeautifulSoup

cars = []
urls = ['https://www.redbook.com.au/cars/details/2019-honda-civic-50-years-edition-auto-my19/SPOT-ITM-524208/']
for url in urls:
    car_data = {}
    headers = {'User-Agent': 'Mozilla/5.0'}
    page = requests.get(url, headers=headers)
    tree = html.fromstring(page.content)
    if tree.xpath('/html/body/div[1]/div[2]/div/div[1]/div[1]/div[4]/div/div'):
        car_data["namings"] = tree.xpath('/html/body/div[1]/div[2]/div/div[1]/div[1]/div[4]/div/div')[0]

You've already imported BeautifulSoup, so why not use a CSS class selector?
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.redbook.com.au/cars/details/2019-honda-civic-50-years-edition-auto-my19/SPOT-ITM-524208/',
                 headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.content, 'lxml')
info = [i.text.strip() for i in soup.select('.dgi-')]
You could also print it as:
for i in soup.select('.dgi-'):
    print(i.text.strip())

find_all() returns a collection of elements.
strip() is a built-in Python string method that removes all leading and trailing whitespace from a string.
Example:
import requests
from bs4 import BeautifulSoup

cars = []
urls = ['https://www.redbook.com.au/cars/details/2019-honda-civic-50-years-edition-auto-my19/SPOT-ITM-524208/']
for url in urls:
    car_data = []
    headers = {'User-Agent': 'Mozilla/5.0'}
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'lxml')
    car_obj = (soup.find("div", {'class': 'r-center-pane'})
                   .find("div", {'class': 'micro-spec'})
                   .find("div", {'class': 'columns'})
                   .find_all("dd"))
    for x in car_obj:
        text = x.text.strip()
        if text != "":
            car_data.append(text)
    cars.append(car_data)
print(cars)
Output:
[['4 Door Sedan', '4 Cylinder, 1.8 Litre', 'Constantly Variable Transmission, Front Wheel Drive', 'Petrol - Unleaded ULP', '6.4 L/100km']]
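Since the question already imports pandas, here is a minimal follow-up sketch that puts the scraped rows from the cars list above into a DataFrame. The column labels are my own guesses, not taken from the page:
import pandas as pd

# Hypothetical column labels -- the page's own headings may differ.
columns = ['body', 'engine', 'transmission', 'fuel', 'consumption']
df = pd.DataFrame(cars, columns=columns)
print(df)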

Related

Having trouble extracting URLs from a website

I want to extract the URL for each of the buttons in the sidebar, but I can't seem to get past the first one, and I don't know why or how to fix it. Unfortunately, this is for an assignment, so I can't import anything else.
This is the code I tried:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://books.toscrape.com/"
genres = ["Travel", "Mystery", "Historical Fiction", "Sequential Art", "Classics", "Philosophy"]
# write your code below
response = requests.get(url, timeout=3)
soup = BeautifulSoup(response.content, 'html.parser')
sidebar = soup.find_all('div', {'class': 'side_categories'})
for a in sidebar:
    genre_url = a.find('a').get('href')
    print(genre_url)
I got:
catalogue/category/books_1/index.html
I was expecting:
catalogue/category/books_1/index.html
catalogue/category/books/travel_2/index.html
catalogue/category/books/mystery_3/index.html
catalogue/category/books/historical-fiction_4/index.html
catalogue/category/books/sequential-art_5/index.html
catalogue/category/books/classics_6/index.html
...
I used the following CSS selector to find all the anchor tags in the sidebar: .side_categories>ul>li>ul>li>a
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://books.toscrape.com/"
genres = ["Travel", "Mystery", "Historical Fiction", "Sequential Art", "Classics", "Philosophy"]
# write your code below
response = requests.get(url, timeout=3)
soup = BeautifulSoup(response.content, 'html.parser')
genre_url_elems = soup.select(".side_categories>ul>li>ul>li>a")
genre_urls = [e['href'] for e in genre_url_elems]
for url in genre_urls:
    print(url)
Here's the output:
catalogue/category/books/travel_2/index.html
catalogue/category/books/mystery_3/index.html
catalogue/category/books/historical-fiction_4/index.html
catalogue/category/books/sequential-art_5/index.html
catalogue/category/books/classics_6/index.html
catalogue/category/books/philosophy_7/index.html
catalogue/category/books/romance_8/index.html
catalogue/category/books/womens-fiction_9/index.html
catalogue/category/books/fiction_10/index.html
catalogue/category/books/childrens_11/index.html
catalogue/category/books/religion_12/index.html
catalogue/category/books/nonfiction_13/index.html
catalogue/category/books/music_14/index.html
catalogue/category/books/default_15/index.html
catalogue/category/books/science-fiction_16/index.html
catalogue/category/books/sports-and-games_17/index.html
catalogue/category/books/add-a-comment_18/index.html
catalogue/category/books/fantasy_19/index.html
catalogue/category/books/new-adult_20/index.html
catalogue/category/books/young-adult_21/index.html
catalogue/category/books/science_22/index.html
catalogue/category/books/poetry_23/index.html
catalogue/category/books/paranormal_24/index.html
catalogue/category/books/art_25/index.html
catalogue/category/books/psychology_26/index.html
catalogue/category/books/autobiography_27/index.html
catalogue/category/books/parenting_28/index.html
catalogue/category/books/adult-fiction_29/index.html
catalogue/category/books/humor_30/index.html
catalogue/category/books/horror_31/index.html
catalogue/category/books/history_32/index.html
catalogue/category/books/food-and-drink_33/index.html
catalogue/category/books/christian-fiction_34/index.html
catalogue/category/books/business_35/index.html
catalogue/category/books/biography_36/index.html
catalogue/category/books/thriller_37/index.html
catalogue/category/books/contemporary_38/index.html
catalogue/category/books/spirituality_39/index.html
catalogue/category/books/academic_40/index.html
catalogue/category/books/self-help_41/index.html
catalogue/category/books/historical_42/index.html
catalogue/category/books/christian_43/index.html
catalogue/category/books/suspense_44/index.html
catalogue/category/books/short-stories_45/index.html
catalogue/category/books/novels_46/index.html
catalogue/category/books/health_47/index.html
catalogue/category/books/politics_48/index.html
catalogue/category/books/cultural_49/index.html
catalogue/category/books/erotica_50/index.html
catalogue/category/books/crime_51/index.html
For more, read about 'CSS selectors': https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_Selectors
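Note that these hrefs are relative. If you need absolute URLs, a small follow-up sketch using the standard library joins them against the base URL:
from urllib.parse import urljoin

base = "https://books.toscrape.com/"
# turn each relative href from genre_urls above into an absolute URL
absolute_urls = [urljoin(base, href) for href in genre_urls]
# e.g. https://books.toscrape.com/catalogue/category/books/travel_2/index.html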
Here you go:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://books.toscrape.com/"
genres = ["Travel", "Mystery", "Historical Fiction", "Sequential Art", "Classics", "Philosophy"]
# write your code below
response = requests.get(url, timeout=3)
soup = BeautifulSoup(response.content, 'html.parser')
# sidebar = soup.find_all('div', {'class': 'side_categories'})
sidebar = soup.find_all('a', href=True)
for link in sidebar:
    url = link['href']
    if 'catalogue' in url:
        print(url)

How to scrape reviews into a dataframe

I would like to scrape the reviews from this page and save them as a data frame, but I am not capturing the star ratings and the review text separately, just a single blob of text. What did I do wrong?
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.morele.net/pralka-candy-cs4-1062d3-950636/?sekcja=reviews-all")
soup = BeautifulSoup(page.content, "html.parser").find_all("div", {"class": "reviews-item"})
# print(soup)
morele = [div.getText(strip=True) for div in soup]
print(morele)
csv_table = pd.DataFrame(morele)
csv_table = csv_table.reset_index(drop=True)
csv_table.insert(0, 'No.', csv_table.index)
You are mostly there - just navigate further into the DOM and you can pull out each piece of text separately.
import requests
import pandas as pd
from bs4 import BeautifulSoup

page = requests.get("https://www.morele.net/pralka-candy-cs4-1062d3-950636/?sekcja=reviews-all")
soup = BeautifulSoup(page.content, "html.parser")
data = [{"text": ri.find("div", {"class": "rev-desc"}).getText(strip=True),
         "stars": ri.find("div", {"class": "rev-stars"}).getText(strip=True)}
        for ri in soup.find_all("div", {"class": "reviews-item"})]
pd.DataFrame(data)
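If the goal is a CSV file on disk (the question imports csv and builds a table), a minimal follow-up sketch; the filename here is arbitrary:
df = pd.DataFrame(data)
df.to_csv("reviews.csv", index=False)  # writes the text/stars columns to reviews.csv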

Get the descriptions of product details from the script with BeautifulSoup and json

I'm trying to get the product-detail descriptions embedded in a script tag on this page.
Here's the code:
import re
import json
from bs4 import BeautifulSoup
import requests

url = 'https://oldnavy.gap.com/browse/product.do?pid=599211032&rrec=true&mlink=5050,12413545,onproduct1_rr_3&clink=12413545#pdp-page-content'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
results = soup.find('script', id="pdpData").string
jsData = re.search(r'window.__PRODUCT_PAGE_STATE__\s+=\s+', results)
data = json.loads(jsData.group(0))
The part xxxxx I want sits in the script like this:
window.__PRODUCT_PAGE_STATE__ = JSON.parse(xxxxx)
Even with re.search finding window.__PRODUCT_PAGE_STATE__, I still cannot reach the xxxxx part.
Is there any other way to extract the info in the xxxxx part?
Try this:
import re
import json
from bs4 import BeautifulSoup
import requests

url = 'https://oldnavy.gap.com/browse/product.do?pid=599211032&rrec=true&mlink' \
      '=5050,12413545,onproduct1_rr_3&clink=12413545#pdp-page-content'
soup = BeautifulSoup(
    requests.get(url).content,
    "html.parser",
).find('script', id="pdpData")
the_xxx_part = json.loads(
    re.search(r"\.parse\((.+)\);", soup.string, re.S).group(1).strip(),
)
print(json.loads(the_xxx_part)["productData"]["name"])
Output:
Unisex Faux-Fur-Trim Hooded Frost-Free Puffer Jacket for Toddler
To print the entire JSON object, change this:
print(json.loads(the_xxx_part)["productData"]["name"])
into this:
print(json.loads(the_xxx_part))
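The double json.loads is deliberate: the page assigns the state through JSON.parse(...), so the script holds a JSON string that itself contains JSON. The first loads unwraps the outer string, the second parses the actual object. A toy illustration of the same shape (example data, not taken from the page):
import json

embedded = '"{\\"productData\\": {\\"name\\": \\"example\\"}}"'  # a JSON string containing JSON
inner = json.loads(embedded)       # first pass: yields the inner JSON text as a str
obj = json.loads(inner)            # second pass: parses the actual object
print(obj["productData"]["name"])  # -> example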

Beautiful Soup to extract Chemical Names

I am trying to extract the chemical names (all in capital letters) from the URL below:
https://www.legislation.gov.au/Details/F2020L01255
I am interested in the chemicals shown in Schedule 4.
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

url = 'https://www.legislation.gov.au/Details/F2020L01255'
headers = {"Accept-Language": "EN-AU, en;q=0.5"}
results = requests.get(url, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
chemicals = []
chems_div = soup.find_all('div', class_='WordSection7')
I am stuck from here. The chemical names are wrapped in a p tag with class='MsoNormal' and a span tag with lang='EN-AU'.
Try this:
import requests
from bs4 import BeautifulSoup

url = 'https://www.legislation.gov.au/Details/F2020L01255'
headers = {"Accept-Language": "EN-AU, en;q=0.5"}
results = requests.get(url, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
chems_div = soup.find('div', class_='WordSection7')
all_spans = [
    t.getText(strip=True) for t in
    chems_div.find_all("span", {"lang": "EN-AU"})
]
print([w for w in all_spans if w.isupper() and w != "SCHEDULE 4"])
Output:
['ABACAVIR.', 'ABATACEPT.', 'ABIRATERONE ACETATE.', 'ABCIXIMAB.', 'ABEMACICLIB.', 'ACALABRUTINIB.', 'ACAMPROSATE CALCIUM.', 'ACARBOSE.', 'ACEBUTOLOL.', 'ACEPROMAZINE.', 'ACETARSOL.', 'ACETAZOLAMIDE.', 'ACETOHEXAMIDE.', 'ACETYL ISOVALERYLTYLOSIN.', 'ACETYLCARBROMAL.', 'ACETYLCHOLINE.', 'ACETYLDIGITOXIN.', 'ACETYLMETHYLDIMETHYLOXIMIDOPHENYLHYDRAZINE.', 'ACETYLSTROPHANTHIDIN.', 'ACIPIMOX.', '# ACITRETIN.', 'ACLIDINIUM BROMIDE.', 'ACOKANTHERA OUABAIO.', 'ACOKANTHERA SCHIMPERI.', 'ACRIVASTINE.', 'ADALIMUMAB.', 'ADAPALENE.', 'ADEFOVIR.', 'ADIPHENINE.', 'ADONIS VERNALIS.', 'ADRAFINIL.', 'AFAMELANOTIDE.', 'AFATINIB DIMALEATE.'
and so on...
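The extracted names carry trailing periods and occasional '#' markers, as visible in the output above. A small follow-up sketch to normalise them (purely illustrative cleanup, not part of the original answer):
cleaned = [w.lstrip('#').strip().rstrip('.') for w in all_spans
           if w.isupper() and w != "SCHEDULE 4"]
# '# ACITRETIN.' -> 'ACITRETIN', 'ABACAVIR.' -> 'ABACAVIR'
print(cleaned[:5])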

How to get the data from <script> with no attributes inside <body>?

I am trying to extract the coordinates of all the restaurant locations using Beautiful Soup. How can I extract them from the script tag under the body?
from bs4 import BeautifulSoup as bs
import requests
import json

base_url = 'https://locations.wafflehouse.com/'
r = requests.get(base_url)
soup = bs(r.text, 'html.parser')
all_scripts = soup.find_all('script')
print(all_scripts[19])
UPDATED ANSWER:
You need to parse the JSON with json.loads() and then navigate it. Try this code - it works smoothly:
import json, requests
from bs4 import BeautifulSoup

req = requests.get('https://locations.wafflehouse.com/')
soup = BeautifulSoup(req.content, 'html.parser')
# strip the JavaScript assignment so only the JSON payload remains
data = soup.find_all('script')[19].text
jdata = data.replace('window.__SLS_REDUX_STATE__ =', '').replace(';', '')
data = json.loads(jdata)
for i in data['dataLocations']['collection']['features']:
    LatLong = i['geometry']['coordinates']
    print(LatLong)
Output:
[-90.073113, 30.37019]
[-84.131085, 33.952944]
[-78.719497, 36.14261]
[-95.629084, 29.947421]
[-83.9019, 33.56531]
[-80.091552, 37.288422]
[-77.949231, 34.237534]
[-96.60637, 32.968131]
[-80.969088, 29.151235]
[-86.843386, 33.354666]
[-84.206, 33.462175]
[-76.342464, 36.830187]
[-79.985822, 32.898412]
[-84.2784568595722, 33.
[-88.780694, 35.674914]
[-87.898899, 30.598605]
[-83.71487, 32.614092]
[-79.523611, 36.07101]
[-91.127792, 30.580582]
[-86.352681, 35.875097]
[-90.271372, 30.023002]
[-80.205641, 25.955672]
[-81.632, 30.157]
[-86.961821, 31.454352]
[-80.666906, 35.366769]
[-97.56596, 35.406447]
[-84.364334, 35.511474]
[-81.01622, 29.23453]
[-86.57177, 34.855504]
[-84.625908, 33.399829]
[-76.344303, 36.740862]
[-84.192634, 33.517948]
[-77.83421, 39.296024]
[-77.518985, 38.359332]
[-84.45238, 38.042061]
[-83.08319, 39.840191]
[-81.993971, 33.475816]
[-95.481102, 29.913294]
[-82.699, 28.334]
[-84.352035, 33.989889]
[-86.819468, 35.945115]
[-91.009638, 30.407864]
[-81.8428, 27.9096]
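One caveat: hard-coding find_all('script')[19] is brittle - the index shifts whenever the page adds or removes a script tag. A more defensive sketch (assuming the same window.__SLS_REDUX_STATE__ marker is still present) locates the script by its content instead:
import json, requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('https://locations.wafflehouse.com/').content, 'html.parser')
# find the script whose text contains the state marker, wherever it sits
state_script = soup.find('script', string=lambda s: s and '__SLS_REDUX_STATE__' in s)
raw = state_script.string.replace('window.__SLS_REDUX_STATE__ =', '').strip().rstrip(';')
state = json.loads(raw)
for feature in state['dataLocations']['collection']['features']:
    print(feature['geometry']['coordinates'])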
