How to extract data using beautiful soup - python

import requests
from bs4 import BeautifulSoup
import pandas as pd
# Prefix that every href collected from the index page is appended to.
baseurl = 'https://locations.atipt.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

# Download the Alabama index page (this request is sent without the custom
# headers) and gather the href of every anchor inside the 'list-unstyled'
# <ul> elements.
index_response = requests.get('https://locations.atipt.com/al')
index_soup = BeautifulSoup(index_response.content, 'html.parser')
productlinks = [
    baseurl + anchor['href']
    for listing in index_soup.find_all('ul', class_='list-unstyled')
    for anchor in listing.find_all('a', href=True)
]

# Visit each collected URL and print the text of every <p> nested inside the
# <a class="name"> element of each 'listing content-card' div.
for page_url in productlinks:
    page_response = requests.get(page_url, headers=headers)
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    for card in page_soup.find_all('div', class_='listing content-card'):
        name_anchor = card.find('a', class_='name')
        for paragraph in name_anchor.find_all('p'):
            print(paragraph.get_text())
I am trying to extract data, but my code returns nothing. I am trying to extract the text from the p tags. This is the page I am trying to extract the p tag data from: https://locations.atipt.com/al/alabaster

The working solution so far uses CSS selectors to get the data from the p tags, as follows:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

# Root of the site; hrefs found on the index page are resolved against it.
baseurl = 'https://locations.atipt.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

# Fetch the Alabama index page with the same headers used for the detail
# pages, so every request presents the same User-Agent.
r = requests.get('https://locations.atipt.com/al', headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
tra = soup.find_all('ul', class_='list-unstyled')

# urljoin resolves relative, root-relative and absolute hrefs correctly,
# whereas plain string concatenation can yield a doubled slash or a
# malformed URL.
productlinks = [
    urljoin(baseurl, link['href'])
    for links in tra
    for link in links.find_all('a', href=True)
]

# For each city page, print one line holding the concatenated text of the
# <p> elements matched by the selector, with surrounding whitespace and
# non-breaking spaces removed.
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    paragraphs = soup.select('div.listing.content-card div:nth-child(2)>p')
    tag = ''.join(
        [x.get_text(strip=True).replace('\xa0', '') for x in paragraphs]
    )
    print(tag)
Output:
634 1st Street NSte 100Alabaster, AL35007
9256 Parkway ESte ABirmingham, AL352061940 28th Ave SBirmingham, AL352095431 Patrick WaySte 101Birmingham, AL35235833 St. Vincent's DrSte 100Birmingham, AL352051401 Doug Baker BlvdSte 104Birmingham, AL35242
1877 Cherokee Ave SWCullman, AL350551301-A Bridge Creek Dr NECullman, AL35055
1821 Beltline Rd SWSte BDecatur, AL35601
4825 Montgomery HwySte 103Dothan, AL36303
550 Fieldstown RdGardendale, AL35071323 Fieldstown Rd, Ste 105Gardendale, AL35071
2804 John Hawkins PkwySte 104Hoover, AL35244
700 Pelham Rd NorthJacksonville, AL36265
1811 Hwy 78 ESte 108 & 109Jasper, AL35501-4081
76359 AL-77Ste CLincoln, AL35096
1 College DriveStation #14Livingston, AL35470
106 6th Street SouthSte AOneonta, AL35121-1823
50 Commons WaySte DOxford, AL36203
301 Huntley PkwyPelham, AL35124
41 Eminence WaySte BPell City, AL35128
124 W Grand AveSte A-4Rainbow City, AL35906
1147 US-231Ste 9 & 10Troy, AL36081
7201 Happy Hollow RdTrussville, AL35173
100 Rice Mine Road LoopSte 102Tuscaloosa, AL354061451 Dr. Edward Hillard DrSte 130Tuscaloosa, AL35401
3735 Corporate Woods DrSte 109Vestavia, AL35242-2296
636 Montgomery HwyVestavia Hills, AL352161539 Montgomery HwySte 111Vestavia Hills, AL35216

Related

Beautiful soup doesn't get all elements

I'm trying to get all the street addresses that are on the right side of the page (https://www.zillow.com/homes/San-Francisco,-CA_rb/), but instead of getting all of them I get only 9.
from bs4 import BeautifulSoup
import requests
# Browser-like request headers sent with the page request.
header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
}

# Download the search page and parse the returned HTML text.
response = requests.get("https://www.zillow.com/homes/San-Francisco,-CA_rb/", headers=header)
soup = BeautifulSoup(response.text, "html.parser")

# Print every <address> element found in the parsed document.
for address_tag in soup.find_all('address'):
    print(address_tag)
The site uses an API to access the data. I got the URL from the browser's dev tools. The script displays 500 addresses (500 agent lists, as the page states).
import requests
import json
useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5026.0 Safari/537.36 Edg/103.0.1254.0"

# Search-state endpoint URL, copied from the browser's dev tools.
url = "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22San%20Francisco%2C%20CA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-122.63417331103516%2C%22east%22%3A-122.23248568896484%2C%22south%22%3A37.70660374673871%2C%22north%22%3A37.84391640339095%7D%2C%22mapZoom%22%3A12%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A20330%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22isAllHomes%22%3A%7B%22value%22%3Atrue%7D%2C%22sortSelection%22%3A%7B%22value%22%3A%22days%22%7D%7D%2C%22isListVisible%22%3Atrue%7D&wants={%22cat1%22:[%22mapResults%22]}&requestId=2"

request_headers = {"User-Agent": useragent}
page = requests.get(url, headers=request_headers)
# Stop here with an HTTPError if the server answered with an error status.
page.raise_for_status()

# Decode the JSON body and drill down to the list of map results.
data = json.loads(page.content)
results = data["cat1"]["searchResults"]["mapResults"]
print(f"found {len(results)} results")

# Print each result's address, skipping entries whose address is "--".
for entry in results:
    address = entry["address"]
    if address == "--":
        continue
    print(address)
Outputs:
found 500 results
1160 Mission St, San Francisco, CA
1000 N Point St, San Francisco, CA
750 Van Ness Ave, San Francisco, CA
3131 Pierce St, San Francisco, CA
2655 Bush St, San Francisco, CA
1288 Howard St, San Francisco, CA
765 Market St, San Francisco, CA
10 Innes Ct, San Francisco, CA
51 Innes Ct, San Francisco, CA
...

How to select and scrape specific texts out of a bunch <ul> and <li>?

I need to scrape "2015" and "09/09/2015" from the below link:
lacentrale.fr/auto-occasion-annonce-87102353714.html
But since there are many li and ul elements, I can't scrape the exact text. I used the code below. Your help is highly appreciated.
from bs4 import BeautifulSoup
# Parse the page markup held in HTML (no parser is named explicitly).
soup = BeautifulSoup(HTML)
# Locate the first <span class="optionLabel">, step to the next <span> that
# follows it in the document, and read that span's text.
option_label = soup.find('span', {'class': 'optionLabel'})
value_span = option_label.find_next('span')
value_span.get_text()
I'm a fan of CSS selectors and :-soup-contains(), as mentioned in @Andrej's answer. So, just in case, here is an alternative approach for when more options are needed.
Generate a dict with all options and pick the relevant value, using the option label as the key:
# Map each option label (the text of its <button>) to the text of the next
# <span> following the '.optionLabel' element.
data = {
    label.button.text: label.find_next('span').text
    for label in soup.select('.optionLabel')
}
data looks like:
{'Année': '2015', 'Mise en circulation': '09/09/2015', 'Contrôle technique': 'requis', 'Kilométrage compteur': '68 736 Km', 'Énergie': 'Electrique', 'Rechargeable': 'oui', 'Autonomie batterie': '190 Km', 'Capacité batterie': '22 kWh', 'Boîte de vitesse': 'automatique', 'Couleur extérieure': 'gris foncé metal', 'Couleur intérieure': 'cuir noir', 'Nombre de portes': '5', 'Nombre de places': '4', 'Garantie': '6 mois', 'Première main (déclaratif)': 'non', 'Nombre de propriétaires': '2', 'Puissance fiscale': '3 CV', 'Puissance din': '102 ch', 'Puissance moteur': '125 kW', "Crit'Air": '0', 'Émissions de CO2': '0 g/kmA', 'Norme Euro': 'EURO6', 'Prime à la conversion': ''}
Example
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}
url = 'https://www.lacentrale.fr/auto-occasion-annonce-87102353714.html'

# Name the parser explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed and emits a warning, so results can differ
# between machines.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# Map each option label (the text of its <button>) to the text of the next
# <span> following the '.optionLabel' element.
data = {
    e.button.text: e.find_next('span').text
    for e in soup.select('.optionLabel')
}
print(data['Année'], data['Mise en circulation'], sep='\n')
Output
2015
09/09/2015
Try:
import requests
from bs4 import BeautifulSoup
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
}
url = "https://www.lacentrale.fr/auto-occasion-annonce-87102353714.html"

# Fetch the listing page and parse the raw response bytes.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

# For each label, select the <span> that immediately follows the
# '.optionLabel' element whose text contains that label.
year_selector = '.optionLabel:-soup-contains("Année") + span'
registration_selector = '.optionLabel:-soup-contains("Mise en circulation") + span'
v1 = soup.select_one(year_selector)
v2 = soup.select_one(registration_selector)

for value in (v1, v2):
    print(value.text)
Prints:
2015
09/09/2015

Create Rows and Columns in BeautifulSoup

Below is my Python code and its output. I want the output in rows and columns in a DataFrame:
# Download the page at source_data and parse its HTML text.
page = requests.get(source_data)
document = BeautifulSoup(page.text, "html.parser")

# One card <div> per region.
States = document.find_all('div', class_='card bg-darker p-3 mb-3')

# Print the name and the vaccinated-percentage text found in each card.
for card in States:
    name_node = card.find(class_='fw-bold fs-5 mb-2')
    percent_node = card.find(class_='col-3 text-end fs-5 ff-s text-success')
    print(name_node.text, percent_node.text)
Output:
Flanders 80.24%
Wallonia 70.00%
Brussels 56.73%
Ostbelgien 65.11%
Collect your information in a list of dicts and then simply create a data frame from it:
# Build one record per card, then turn the list of records into a DataFrame
# with one row per card and one column per key.
data = [
    {
        'state_name': card.find(class_='fw-bold fs-5 mb-2').text,
        'vaccinated_per': card.find(class_='col-3 text-end fs-5 ff-s text-success').text,
    }
    for card in States
]
pd.DataFrame(data)
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

# Download the page and parse its HTML text.
response = requests.get('https://covid-vaccinatie.be/en', headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# One card <div> per region.
States = soup.find_all('div', class_='card bg-darker p-3 mb-3')

# Build one record per card, then turn the list of records into a DataFrame
# with one row per card and one column per key.
data = [
    {
        'state_name': card.find(class_='fw-bold fs-5 mb-2').text,
        'vaccinated_per': card.find(class_='col-3 text-end fs-5 ff-s text-success').text,
    }
    for card in States
]
pd.DataFrame(data)
Output
state_name vaccinated_per
0 Flanders 80.24%
1 Wallonia 70.00%
2 Brussels 56.73%
3 Ostbelgien 65.11%

I want to scrape these product information

import requests
from bs4 import BeautifulSoup
import pandas as pd
baseurl = 'https://twillmkt.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

# Fetch the collection page with the same headers used for product pages so
# every request presents the same User-Agent.
r = requests.get('https://twillmkt.com/collections/denim', headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
tra = soup.find_all('div', class_='ProductItem__Wrapper')

# Collect product URLs; the first anchor of each wrapper is skipped, exactly
# as the original [1:] slice did.
productlinks = [
    baseurl + link['href']
    for links in tra
    for link in links.find_all('a', href=True)[1:]
]

# One entry per product page, kept in lockstep so they can become columns.
titles = []
prices = []
product_features = []

for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    titles.append(soup.find('h1').text)
    prices.append(soup.find('span', class_="money").text)
    # find_all() returns a ResultSet, which has no .text attribute; read the
    # text of each <li> individually and join them into one cell value.
    # Note that this matches every <li> on the page.
    feature_texts = [li.get_text(strip=True) for li in soup.find_all('li')]
    product_features.append('; '.join(feature_texts))

df = pd.DataFrame(
    {"Title": titles, "Price": prices, "Product_Features": product_features}
)
print(df)
I can scrape the title and price, but I find it difficult to scrape this information: size, product features, material, model size, image.
Single Page Link
https://twillmkt.com/products/light-blue-butterfly-print-slim-leg-denim?variant=39498848403534
I'll give you the CSS selectors for the elements you want. You can catch them with soup.select():
size: select[id^="product-select"] option
product feature: div.ProductMeta__Description.Rte li
material: div.ProductMeta__Description.Rte p:nth-of-type(2)
model size: div.ProductMeta__Description.Rte p:nth-of-type(3)
image: div[id^="Image"] span img

How to grab specific items from entire json response api calls

I want to grab only the Symbol and Company Name items from the entire JSON data, but I am getting
all the data. How can I get the above-mentioned data and store it in a pandas DataFrame?
Base_url
My code:
import requests
import pandas as pd
# Query-string parameters sent with the request.
params = {
    'sectorID': 'All',
    '_': '1630217365368',
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}


def main(url):
    """Request *url* and print every entry of the JSON body's 'data' list.

    The request goes through a session, using the module-level ``params``
    and ``headers``.
    """
    with requests.Session() as session:
        session.headers.update(headers)
        sym = []
        name = []
        response = session.get(url, params=params, headers=headers)
        entries = response.json()['data']
        for entry in entries:
            print(entry)
            # Attempted extraction, left disabled:
            # sym.append(item['symbol']),
            # name.append(item['lonaName'])
        # df = pd.DataFrame(sym, name, columns=[["Symble","Company name"]])
        # print(df)


main('https://www.saudiexchange.sa/wps/portal/tadawul/market-participants/issuers/issuers-directory/!ut/p/z1/04_Sj9CPykssy0xPLMnMz0vMAfIjo8zi_Tx8nD0MLIy8DTyMXAwczVy9vV2cTY0MnEz1w8EKjIycLQwtTQx8DHzMDYEK3A08A31NjA0CjfWjSNLv7ulnbuAY6OgR5hYWYgzUQpl-AxPi9BvgAI4GhPVHgZXgCwFUBVi8iFcByA9gBXgcWZAbGhoaYZDpma6oCABqndOv/p0/IZ7_NHLCH082KOAG20A6BDUU6K3082=CZ6_NHLCH082K0H2D0A6EKKDC520B5=N/')
You need to fix the way you are creating the DataFrame:
import requests
import pandas as pd
# Query-string parameters sent with the request.
params = {
    'sectorID': 'All',
    '_': '1630217365368',
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}


def main(url):
    """Fetch the issuer list from *url* and print it as a two-column table.

    Sends a GET request through a session with the module-level ``params``
    and ``headers``, reads the ``data`` list from the JSON body, and prints
    a DataFrame with the columns ``symbol`` and ``longName``.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    with requests.Session() as req:
        # Headers set on the session are sent with every request made through
        # it, so they do not need to be passed to get() again.
        req.headers.update(headers)
        r = req.get(url, params=params)
        # Surface an HTTP error directly instead of a confusing JSON decode
        # failure on an error page.
        r.raise_for_status()
        items = r.json()['data']

    # 'lonaName' is the key exactly as the original code reads it —
    # presumably the payload's own spelling; verify against the response.
    df = pd.DataFrame({
        'symbol': [item['symbol'] for item in items],
        'longName': [item['lonaName'] for item in items],
    })
    print(df)


main('https://www.saudiexchange.sa/wps/portal/tadawul/market-participants/issuers/issuers-directory/!ut/p/z1/04_Sj9CPykssy0xPLMnMz0vMAfIjo8zi_Tx8nD0MLIy8DTyMXAwczVy9vV2cTY0MnEz1w8EKjIycLQwtTQx8DHzMDYEK3A08A31NjA0CjfWjSNLv7ulnbuAY6OgR5hYWYgzUQpl-AxPi9BvgAI4GhPVHgZXgCwFUBVi8iFcByA9gBXgcWZAbGhoaYZDpma6oCABqndOv/p0/IZ7_NHLCH082KOAG20A6BDUU6K3082=CZ6_NHLCH082K0H2D0A6EKKDC520B5=N/')
symbol longName
0 1330 Abdullah A. M. Al-Khodari Sons Co.
1 4001 Abdullah Al Othaim Markets Co.
2 4191 Abdullah Saad Mohammed Abo Moati for Bookstore...
3 1820 Abdulmohsen Alhokair Group for Tourism and Dev...
4 2330 Advanced Petrochemical Co.
.. ... ...
199 3020 Yamama Cement Co.
200 3060 Yanbu Cement Co.
201 2290 Yanbu National Petrochemical Co.
202 3007 Zahrat Al Waha for Trading Co.
203 2240 Zamil Industrial Investment Co.
To get all data from the site, you can use their API:
import requests
import pandas as pd
url = "https://www.saudiexchange.sa/tadawul.eportal.theme.helper/TickerServlet"

# Request the endpoint and decode its JSON body.
response = requests.get(url)
data = response.json()
# To inspect the raw payload: print(json.dumps(data, indent=4))

# Flatten the records under "stockData" into a table and print it.
stock_records = data["stockData"]
df = pd.json_normalize(stock_records)
print(df)
Prints:
pk_rf_company companyShortNameEn companyShortNameAr companyLongNameEn companyLongNameAr highPrice lowPrice noOfTrades previousClosePrice todaysOpen transactionDate turnOver volumeTraded aveTradeSize change changePercent lastTradePrice transactionDateStr
0 4700 Alkhabeer Income الخبير للدخل Al Khabeer Diversified Income Traded Fund صندوق الخبير للدخل المتنوع المتداول None None 308 None None None 1.293560e+06 142791 463.61 0.01 0.11 9.07 None
1 2030 SARCO المصافي Saudi Arabia Refineries Co. شركة المصافي العربية السعودية None None 877 None None None 1.352797e+07 83391 95.09 -0.40 -0.25 162.20 None
2 2222 SAUDI ARAMCO أرامكو السعودية Saudi Arabian Oil Co. شركة الزيت العربية السعودية None None 4054 None None None 6.034732e+07 1731463 427.10 0.05 0.14 34.90 None
...and so on.
To get only symbol/company name:
# Keep only the two named columns and print them.
wanted_columns = ["pk_rf_company", "companyLongNameEn"]
print(df[wanted_columns])
pk_rf_company companyLongNameEn
0 4700 Al Khabeer Diversified Income Traded Fund
1 2030 Saudi Arabia Refineries Co.
2 2222 Saudi Arabian Oil Co.
...and so on.
It will be way faster if you store the data in a pandas DataFrame and process it later.
Example Code:
import requests
import pandas as pd
# Query-string parameters sent with the request.
params = {
    'sectorID': 'All',
    '_': '1630217365368',
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}


def main(url):
    """Load the 'data' list from *url* into a DataFrame and print two columns.

    The request goes through a session, using the module-level ``params``
    and ``headers``. The first two columns of the resulting frame are
    relabelled 'Symbol' and 'Company name' and printed.
    """
    with requests.Session() as session:
        session.headers.update(headers)
        response = session.get(url, params=params, headers=headers)
        records = response.json()['data']
        full_frame = pd.DataFrame(records)
        # Positional selection: whichever two columns come first.
        df_min = full_frame.iloc[:, :2]
        df_min.columns = ['Symbol', 'Company name']
        print(df_min)


main('https://www.saudiexchange.sa/wps/portal/tadawul/market-participants/issuers/issuers-directory/!ut/p/z1/04_Sj9CPykssy0xPLMnMz0vMAfIjo8zi_Tx8nD0MLIy8DTyMXAwczVy9vV2cTY0MnEz1w8EKjIycLQwtTQx8DHzMDYEK3A08A31NjA0CjfWjSNLv7ulnbuAY6OgR5hYWYgzUQpl-AxPi9BvgAI4GhPVHgZXgCwFUBVi8iFcByA9gBXgcWZAbGhoaYZDpma6oCABqndOv/p0/IZ7_NHLCH082KOAG20A6BDUU6K3082=CZ6_NHLCH082K0H2D0A6EKKDC520B5=N/')
Output:

Categories

Resources