Access Hidden Data on a page

Access Hidden Data on a page - python

I need to access the following website: http://mothoq.com/store/22
scroll down till i see the phone icon.
click on it, and scrape the phone number.
I have successfully connected to the website, and able to scrape all data needed, except of the phone number.
I have tried to use
soup.find_all('p',attrs={"align":"center"})
my code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "html5lib")
results = soup.find('div', attrs={'id': 'subtitle'})
for storeData in results:
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
for storeContact in contacts:
storePhone = soup.find_all('p', attrs={"align":"center"})
storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"})['href']
storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"})['href']
storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"})['href']
print(storePhone)
Thanks!

You should search for hidden div with id="store-telephone-form" and take second
<p> tag from it.
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "lxml")
results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
try:
storePhone = soup.find('div', attrs={"id":"store-telephone-form"}).select('p')[1].text
storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"}).get('href')
storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"}).get('href')
storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"}).get('href')
except:
pass
print(storePhone)

Related

Web scraping doesn't iterate over entire webpage

Im trying to scrape the information of all the player names and player rating from this website:
https://www.fifaindex.com/players/?gender=0&league=1&order=desc
But i only get the information from the first player on the page.
The code im using:
from bs4 import BeautifulSoup
import requests
url = "https://www.fifaindex.com/players/?gender=0&league=1&order=desc"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find_all('div', class_="responsive-table table-rounded")
for result in results:
rating = result.find("span", class_="badge badge-dark rating r3").text
name = result.find("a", class_="link-player")
info = [rating, name]
print(info)
The HTML parsed is attached in the picture

I tinkered around a little bit and I think I got a version that does what you want
from bs4 import BeautifulSoup
import requests
page = requests.get("https://www.fifaindex.com/players/?
gender=0&league=1&order=desc")
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find_all("tr")
for result in results:
try:
result["data-playerid"]
except KeyError:
continue
rating = result.find("span", class_="badge badge-dark rating r3").text
name = result.find("a", class_="link-player")
info = [rating, name]
print(info)

Get all table lines with a data-playerid attribute will fix it:
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests
url = "https://www.fifaindex.com/players/?gender=0&league=1&order=desc"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
results = soup.find_all('tr', {'data-playerid': True})
for res in results:
rating = res.find("span", class_="badge badge-dark rating r3").text
name = res.find("a", class_="link-player")
info = [rating, name]
print(info)

how to get all 'href' from page in company profile box

Can anyone tell me where is the prblm i am new in python i want get all links from this page here is my code
import requests
from bs4 import BeautifulSoup
import pandas as pd
re=requests.get('https://www.industrystock.com/en/companies/Agriculture')
re
soup = BeautifulSoup(re.text, 'lxml')
link_list = []
page1 = soup.find_all('a', class_ = 'btn awe-info gotoJS iconColor_white')
page1
for i in page1:
link = (i.get('href'))
link_list.append(link)

The links to company profiles are stored in data-href= attribute:
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.industrystock.com/en/companies/Agriculture")
soup = BeautifulSoup(r.content, "lxml")
page1 = soup.find_all("a", class_="btn awe-info gotoJS iconColor_white")
for i in page1:
print(i["data-href"])
Prints:
https://www.industrystock.com/en/company/profile/ARCA-Internet-Services-Ltd./370071
https://www.industrystock.com/en/company/profile/Забайкальская-аграрная-Ассоциация-образовательных-и-научных-учреждений/256182
https://www.industrystock.com/en/company/profile/...VÁŠ-INTERIÉR-s.r.o./534809
https://www.industrystock.com/en/company/profile/1-WITOS-s.r.o./529071
https://www.industrystock.com/en/company/profile/1.-TOUŠEŇSKÁ-s.r.o./544981
https://www.industrystock.com/en/company/profile/1.HEFAISTOS-s.r.o./541263
https://www.industrystock.com/en/company/profile/1.HRADECKÁ-ZEMĚDĚLSKÁ-a.s./548267
https://www.industrystock.com/en/company/profile/1.MAXIMA-INTERNATIONAL-s.r.o./530049
https://www.industrystock.com/en/company/profile/1.MIROSLAVSKÁ-STROJÍRNA-spol.-s-r.o./544781
https://www.industrystock.com/en/company/profile/1.VASTO-spol.-s-r.o./535985
https://www.industrystock.com/en/company/profile/1C-PRO-s.r.o./534831
https://www.industrystock.com/en/company/profile/1CSC-a.s./528169
https://www.industrystock.com/en/company/profile/1P-CONTROL/549995
https://www.industrystock.com/en/company/profile/2-ES-spol.-s-r.o./547849
https://www.industrystock.com/en/company/profile/2-G-SERVIS-spol.-s-r.o./528391
https://www.industrystock.com/en/company/profile/2-JCP-a.s./537151
https://www.industrystock.com/en/company/profile/2-THETA-ASE-s.r.o./545079
https://www.industrystock.com/en/company/profile/2LMAKERS-s.r.o./542127
https://www.industrystock.com/en/company/profile/2M-SERVIS-s.r.o./550923
https://www.industrystock.com/en/company/profile/2M-STATIC-s.r.o./549935
https://www.industrystock.com/en/company/profile/2M-STROJE-s.r.o./539885
https://www.industrystock.com/en/company/profile/2TMOTORS-s.r.o./543869
https://www.industrystock.com/en/company/profile/2VV-s.r.o./538993
https://www.industrystock.com/en/company/profile/2xSERVIS-s.r.o./528321
https://www.industrystock.com/en/company/profile/3-PLUS-1-SERVICE-s.r.o./535103
https://www.industrystock.com/en/company/profile/3-TOOLING-s.r.o./540599
https://www.industrystock.com/en/company/profile/3B-SOCIÁLNÍ-FIRMA-s.r.o./535127
https://www.industrystock.com/en/company/profile/3D-KOVÁRNA-s.r.o./549765
https://www.industrystock.com/en/company/profile/3D-TECH-spol.-s-r.o./548047
https://www.industrystock.com/en/company/profile/3DNC-SYSTEMS-s.r.o./549379

Try this:
response = requests.get('https://www.industrystock.com/en/companies/Agriculture')
soup = BeautifulSoup(response.text, 'lxml')
link_list = []
page1 = soup.find_all('a', {"class":'btn awe-info gotoJS iconColor_white'})
for i in page1:
link = i['href']
link_list.append(link)
And I would also recommend using html.parser if you are not scraping XML.

Class of every same page on same website changes. I am trying scrape description from page

import requests
from bs4 import BeautifulSoup
import numpy as np
import re
import json
title = []
pages = np.arange(1,13)
for page in pages:
url = 'https://www.jobs.ch/en/vacancies/?page='+str(page)+'&term=python%20web'
print(url)
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
jobs = soup.find_all('a', class_='sc-hGPAah Link-sc-1vy3ms6-1 jegSWD', href=True, title=True)
for job in jobs:
job_title = job['title']
print(job_title)
title.append(job_title)
job_half_url = job['href']
job_full_url = 'https://www.jobs.ch'+str(job_half_url)
print(job_full_url)
data = re.search(r"__INIT__ = (\{.*\})", requests.get(job_full_url).text).group(1)
data = json.loads(data)
# print(json.dumps(data, indent=4))
for j in data["lists"]["jobs"].values():
soup2 = BeautifulSoup(j["template"], "html.parser")
# print(soup2)
for t in soup2.find_all(class_="col-xs-12 col-md-9 col-md-push-3"):
desc.append(t.get_text(strip=True, separator="\n"))
print(t.get_text(strip=True, separator="\n"))
I am trying to scrape description of page. but every page changes it's div class name.
in line for t in soup2.find-all(class_ = "here name changes"):

Use stable looking attributes/types and their relationship e.g.
print(soup2.select_one('.content-row > div:nth-child(1)').get_text(strip=True, separator="\n"))
Here I rely on the child div relationship to a more stable looking parent class.

python bs4 find() return none

I am getting none by using bs4 find() function even it is exists in the html. I am trying to get all div with a class tab_content.I am finding this on this link https://sofad.qc.ca/index.php?id_product=464&controller=product&id_lang=1. So kindly suggest me how to do this in a right way.
This is the code:
from bs4 import BeautifulSoup as bs
import requests
url = 'https://sofad.qc.ca/index.php?id_category=78&controller=category&id_lang=1'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
tb = soup.find_all('a', class_='product_img_link')
for item in tb:
link = item.get('href')
r = requests.get(link)
soup = bs(r.content, 'lxml')
try:
title = soup.find('h1', {'itemprop':'name'}).text
except:
title = ''
try:
price = soup.find('span', id='our_price_display').text
except:
price = ''
try:
img = soup.find('img', id='bigpic').get('src')
except:
img = ''
try:
dv = " ".join(soup.find('div', class_='rte').text.split())
except:
dv = ''
for dvv in soup.find_all('div', class_='tab_content'):
print(dvv)

As far as I know, the syntax is findAll NOT find_all!
To find a div with a class of tab_content:
tab_content = soup.findAll('div', class_='tab_content')
FYI, there is no div with tab_content class in the html tree on that page.

Selecting HTML page values using Beautiful Soup

I have been trying to figure this out for a few hour and it is doing my head in. Every method I try is not presenting the correct value.
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.off---white.com/en/GB/products/omia065s188000160100')
soup = BeautifulSoup(r.content, 'html.parser')
I want to extract the following values from the webpage (https://www.off---white.com/en/GB/products/omia065s188000160100)
Name = LOW 3.0 SNEAKER
Price = £ 415
img_url = https://cdn.off---white.com/images/156365/large_OMIA065S188000160100_4.jpg?1498202305
How would I extract these 3 values using Beautiful Soup?

import requests
from bs4 import BeautifulSoup
# Get prod name
r = requests.get('https://www.off---white.com/en/GB/products/omia065s188000160100')
soup = BeautifulSoup(r.text, 'html.parser')
spans = soup.find_all('span', {'class' : 'prod-title'})
data = [span.get_text() for span in spans]
prod_name = ''.join(data)
# Find prod price
spans = soup.find_all('div', {'class' : 'price'})
data = [span.get_text() for span in spans]
prod_price = ''.join(data)
# Find prod img
spans = soup.find_all('img', {'id' : 'image-0'})
for meta in spans:
prod_img = meta.attrs['src']
print(prod_name)
print(prod_price)
print(prod_img)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Access Hidden Data on a page - python

Related

Web scraping doesn't iterate over entire webpage

how to get all 'href' from page in company profile box

Class of every same page on same website changes. I am trying scrape description from page

python bs4 find() return none

Selecting HTML page values using Beautiful Soup

Categories

Resources