Retrieving content using BeautifulSoup and selectors - Python

I'm trying to retrieve content (text) embedded in HTML, but I'm not getting the content.
I'm trying to use a selector in this format to find price_box:
price_box = soup2.find('div', attrs={'title class': 'Fw(600)'})
# Import libraries
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
# Set the URL you want to webscrape from
url = 'https://finance.yahoo.com/quote/NVDA?p=NVDA'
# Connect to the URL
response = requests.get(url)
# Parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
beta = soup.find('h1')
#print (beta)
link = beta.contents
variable = 'NVDA - NVIDIA Corporation'
test = 'NVDA - NVIDIA Corporation'
#<..>
url2 = 'https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01'
response2 = requests.get(url2)
soup2 = BeautifulSoup(response2.text, "html.parser")
# alpha = soup2.find('')
# div = soup.find('a', {class_ ='D(ib) '})
# text = div.string
price_box = soup2.find('div', attrs={'title class': 'Fw(600)'})
#price = price_box.text
print("Price Box: "+ str(price_box)) # THIS IS WHAT I WANT
I was hoping to see "Senea". Instead I'm seeing "None": "Price Box: None".

A lot of the content is dynamic. You can regex that info out easily:
import requests, re
# The ticker symbols appear in the raw HTML as "YFINANCE:<symbol>" tokens
p = re.compile(r'"YFINANCE:(.*?)"')
r = requests.get('https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01&guccounter=1')
print(p.findall(r.text)[0])
An alternative is to avoid the dynamic-looking classes altogether:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://finance.yahoo.com/calendar/earnings?from=2019-09-01&to=2019-09-07&day=2019-09-01&guccounter=1')
soup = bs(r.content, 'lxml')
# Target the results table by its stable id rather than the obfuscated classes
print(soup.select_one('#cal-res-table a').text)
Reading:
CSS selectors
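For a quick illustration of the selector forms used above (an `#id descendant` selector with select_one), here is a minimal, self-contained sketch on made-up HTML:
from bs4 import BeautifulSoup
# Made-up HTML, purely to illustrate CSS selector syntax
html = '<div id="box"><a class="item" href="/a">A</a><a class="item" href="/b">B</a></div>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.select_one('#box a').text)                        # first <a> under id="box" -> A
print([a['href'] for a in soup.select('div#box > a.item')])  # ['/a', '/b']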

python beautifulsoup to download specific .zip files from a website

I am new to web scraping. I am trying to download only the weekly .zip files from the website below. I was able to parse the label but couldn't go beyond that to download the weekly zip files.
https://download.cms.gov/nppes/NPI_Files.html
HTML code for the 'li' tag:
import requests
from bs4 import BeautifulSoup
URL = "https://download.cms.gov/nppes/NPI_Files.html"
r = requests.get(URL)
#print(r.content)
soup = BeautifulSoup(r.content, 'html.parser')
#print(soup.prettify())
links=[]
ref = soup.select('li')
#print(ref)
for i in ref:
    pass  # stuck here: couldn't get from the <li> tags to the weekly zip links
Try the next example:
import requests
from bs4 import BeautifulSoup
URL = "https://download.cms.gov/nppes/NPI_Files.html"
r = requests.get(URL)
#print(r.content)
soup = BeautifulSoup(r.content, 'html.parser')
#print(soup.prettify())
links=[]
for ref in soup.select('div.bulletlistleft > ul > li')[2:]:
    zip = 'https://download.cms.gov/nppes' + ref.a.get('href').replace('./', '/')
    links.append(zip)
print(links)
Output:
['https://download.cms.gov/nppes/NPPES_Data_Dissemination_090522_091122_Weekly.zip', 'https://download.cms.gov/nppes/NPPES_Data_Dissemination_091222_091822_Weekly.zip', 'https://download.cms.gov/nppes/NPPES_Data_Dissemination_091922_092522_Weekly.zip']
Access a tags if there are any:
for i in ref:
    if i.a is not None:
        print(i.a.get('href'))
You can either just select all links then filter them down to the weekly updates:
ref = [
    a for a in soup.select('a')
    if a.text and 'Weekly Update' in a.text
]
OR you can narrow down to a parent element before selecting the links:
ref = soup.find(
    'b', string='Weekly Incremental NPI Files'
).find_next('tr').select('a')
Either way, you'll end up with the same set of links:
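For example, to turn either ref list into absolute download URLs (a sketch reusing the base URL from the answer above):
base = 'https://download.cms.gov/nppes'
weekly_links = [base + a.get('href').replace('./', '/') for a in ref]
print(weekly_links)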
The following code will find the links, download the files, and unzip them.
import requests
from bs4 import BeautifulSoup
import os
from urllib.request import urlretrieve
import zipfile
target_folder = os.path.dirname(os.path.realpath(__file__))
base_url = "https://download.cms.gov/nppes"
url = f"{base_url}/NPI_Files.html"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
elms = soup.select('div.bulletlistleft > ul > li > a')
for elm in elms:
    zip_filename = elm.attrs['href'].lstrip('./')
    zip_full_url = "/".join((base_url, zip_filename))
    target_zip_path = os.path.join(target_folder, zip_filename)
    target_zip_dir = ".".join(target_zip_path.split('.')[:-1])
    urlretrieve(zip_full_url, target_zip_path)
    with zipfile.ZipFile(target_zip_path, 'r') as zip_file:
        zip_file.extractall(target_zip_dir)
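As a side note, for large archives you may prefer streaming the download with requests rather than urlretrieve; a minimal sketch, assuming the zip_full_url and target_zip_path variables from the loop above:
import requests
# Stream the archive to disk in chunks instead of buffering it in memory
with requests.get(zip_full_url, stream=True) as resp:
    resp.raise_for_status()
    with open(target_zip_path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=1 << 20):
            f.write(chunk)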

Can't scrape all of ul tags from a table

I'm trying to scrape all of the proxy IPs from this site: https://proxy-list.org/english/index.php but I can only get one IP at most.
Here is my code:
from helium import *
from bs4 import BeautifulSoup
url = 'https://proxy-list.org/english/index.php'
browser = start_chrome(url, headless=True)
soup = BeautifulSoup(browser.page_source, 'html.parser')
proxies = soup.find_all('div', {'class':'table'})
for ips in proxies:
    print(ips.find('li', {'class':'proxy'}).text)
I tried to use ips.find_all but it didn't work.
from bs4 import BeautifulSoup
import requests
url = 'https://proxy-list.org/english/index.php'
pagecontent = requests.get(url)
soup = BeautifulSoup(pagecontent.text, 'html.parser')
maintable = soup.find_all('div', {'class':'table'})
for div_element in maintable:
    rows = div_element.find_all('li', class_='proxy')
    for ip in rows:
        print(ip.text)
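Note that this plain-requests version assumes the IPs are present in the raw HTML; since the question used a headless browser, a variant that keeps helium's rendered page source might look like this (a sketch, not tested against the site):
from helium import start_chrome, kill_browser
from bs4 import BeautifulSoup
# Render the page in a headless browser first, then parse the result
browser = start_chrome('https://proxy-list.org/english/index.php', headless=True)
soup = BeautifulSoup(browser.page_source, 'html.parser')
for div_element in soup.find_all('div', {'class': 'table'}):
    for ip in div_element.find_all('li', class_='proxy'):
        print(ip.text)
kill_browser()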
If I understand your question correctly, the following is one of the ways you can fetch those proxies using the requests module and the BeautifulSoup library:
import re
import base64
import requests
from bs4 import BeautifulSoup
url = 'https://proxy-list.org/english/index.php'
def decode_proxy(target_str):
    # The IPs are base64-encoded inside the page's script tags
    converted_proxy = base64.b64decode(target_str)
    return converted_proxy.decode()
res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')
for tr in soup.select("#proxy-table li.proxy > script"):
    proxy_id = re.findall(r"Proxy[^']+(.*)\'", tr.contents[0])[0]
    print(decode_proxy(proxy_id))
First few results:
62.80.180.111:8080
68.183.221.156:38159
189.201.134.13:8080
178.60.201.44:8080
128.199.79.15:8080
139.59.78.193:8080
103.148.216.5:80

HTML Parsing using bs4

I am parsing an HTML page and am having a hard time figuring out how to pull a certain 'p' tag without a class or an id. I am trying to reach the 'p' tag with the lat and long. Here is my current code:
import bs4
from urllib import urlopen as uReq #this opens the URL
from bs4 import BeautifulSoup as soup #parses/cuts the html
my_url = 'http://www.fortwiki.com/Battery_Adair'
print(my_url)
uClient = uReq(my_url) #opens the HTML and stores it in uClients
page_html = uClient.read() # reads the URL
uClient.close() # closes the URL
page_soup = soup(page_html, "html.parser") #parses/cuts the HTML
containers = page_soup.find_all("table")
for container in containers:
    title = container.tr.p.b.text.strip()
    history = container.tr.p.text.strip()
    lat_long = container.tr.table
    print(title)
    print(history)
    print(lat_long)
Link to website: http://www.fortwiki.com/Battery_Adair
The <p> tag you're looking for is very common in the document, and it doesn't have any unique attributes, so we can't select it directly.
A possible solution would be to select the tag by index, as in bloopiebloopie's answer.
However that won't work unless you know the exact position of the tag.
Another possible solution would be to find a neighbouring tag that has distinguishing attributes/text and select our tag in relation to that.
In this case we can find the previous tag with text: "Maps & Images", and use find_next to select the next tag.
import requests
from bs4 import BeautifulSoup
url = 'http://www.fortwiki.com/Battery_Adair'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
b = soup.find('b', text='Maps & Images')
if b:
    lat_long = b.find_next().text
This method should find the coordinates data in any www.fortwiki.com page that has a map.
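For instance, a sketch applying the same lookup to several pages (the second page name is a hypothetical example, used only for illustration):
import requests
from bs4 import BeautifulSoup
# 'Fort_Taylor' is a hypothetical page name
for page in ['Battery_Adair', 'Fort_Taylor']:
    r = requests.get('http://www.fortwiki.com/' + page)
    soup = BeautifulSoup(r.text, 'html.parser')
    b = soup.find('b', text='Maps & Images')
    if b:
        print(page, b.find_next().text)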
You can use re to match partial text inside a tag.
import re
import requests
from bs4 import BeautifulSoup
url = 'http://www.fortwiki.com/Battery_Adair'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
lat_long = soup.find('p', text=re.compile(r'Lat:\s\d+\.\d+\sLong:')).text
print(lat_long)
# Lat: 24.5477038 Long: -81.8104541
I am not exactly sure what you want, but this works for me. There are probably neater ways of doing it. I am new to Python.
import requests
from bs4 import BeautifulSoup
soup = BeautifulSoup(requests.get("http://www.fortwiki.com/Battery_Adair").content, "html.parser")
x = soup.find("div", id="mw-content-text").find("table").find_all("p")[8]
x = x.get_text()
x = x.split("Long:")
lat = x[0].split(" ")[1]
long = x[1].strip()
print("LAT = " + lat)
# LAT = 24.5477038
print("LNG = " + long)
# LNG = -81.8104541

Access Hidden Data on a page

I need to access the following website: http://mothoq.com/store/22, scroll down till I see the phone icon, click on it, and scrape the phone number.
I have successfully connected to the website and am able to scrape all the data needed, except for the phone number.
I have tried to use
soup.find_all('p',attrs={"align":"center"})
my code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "html5lib")
results = soup.find('div', attrs={'id': 'subtitle'})
for storeData in results:
    storeName = soup.find('h1')
    url = soup.find('font').text
    contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
    for storeContact in contacts:
        storePhone = soup.find_all('p', attrs={"align":"center"})
        storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"})['href']
        storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"})['href']
        storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"})['href']
print(storePhone)
Thanks!
You should search for the hidden div with id="store-telephone-form" and take the second <p> tag from it.
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "lxml")
results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
try:
    storePhone = soup.find('div', attrs={"id":"store-telephone-form"}).select('p')[1].text
    storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"}).get('href')
    storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"}).get('href')
    storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"}).get('href')
except:
    pass
print(storePhone)
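If you want to avoid the bare except, a slightly safer sketch of just the phone lookup (same ids as above):
# Check each step explicitly instead of swallowing all exceptions
phone_div = soup.find('div', attrs={'id': 'store-telephone-form'})
if phone_div is not None:
    paragraphs = phone_div.select('p')
    if len(paragraphs) > 1:
        print(paragraphs[1].text)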

Problems retrieving information from imdb

I'm trying to get the movie titles from an IMDb watchlist. This is my code:
import requests, bs4
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
print(soup.find_all('.lister-item-header'))
Even though '.lister-item-header' exists in the Chrome developer console, it doesn't exist in the HTML file that the requests module downloaded. I've also tried using regular expressions. What would be the best way of retrieving the titles?
You should select elements by their class in this way:
import requests
import bs4
url = 'http://www.imdb.com/chart/top'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
rows = soup.select('.titleColumn > a')
for row in rows:
    print(row.text)
Or you can do it in this way:
import requests
import bs4
url = 'http://www.imdb.com/chart/top'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
rows = soup.find_all('td', class_='titleColumn')
for row in rows:
    print(row.a.text)
The data is loaded from a JSON object embedded in the raw HTML file, so we can parse it and get the titles.
import requests
import bs4
import json
url = 'http://www.imdb.com/user/ur69187878/watchlist?ref_=wt_nv_wl_all_1'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
# rows = soup.find_all('h3', class_='list-item-header')
js_elements = soup.find_all('script')
js_text = None
search_str = 'IMDbReactInitialState.push('
for element in js_elements:
    text = element.text
    if search_str in text:
        js_text = text.strip()
        break
json_start = js_text.index(search_str) + len(search_str)
json_text = js_text[json_start:-2]
json_obj = json.loads(json_text)
for title in json_obj['titles']:
    json_title = json_obj['titles'][title]
    print(json_title['primary']['title'])
But I have to say that this is not a general method for this kind of problem. If you want a general solution for pages whose data is loaded from JSON or an API, you can use other tools such as Selenium.
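For completeness, a minimal Selenium sketch along those lines (assumes a Chrome driver is installed; the class name comes from the question and may change as IMDb updates its markup):
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.implicitly_wait(10)  # give the page time to render its dynamic content
driver.get('http://www.imdb.com/user/ur69187878/watchlist')
for header in driver.find_elements(By.CSS_SELECTOR, '.lister-item-header a'):
    print(header.text)
driver.quit()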
