How can I import values from a given URL into Python?

How can I extract the value of the last price for the 12,000.00 strike price from the given URL in Python?
https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=-10006&symbol=NIFTY&symbol=NIFTY&instrument=-&date=-&segmentLink=17&symbolCount=2&segmentLink=17
The LTP of the 12,000.00 strike price is 25.35.

With bs4 4.7.1+ you can use the :has and :contains pseudo-classes. Use :contains with td:nth-of-type to search the strike-price column, then :has to retrieve the parent row, and finally a descendant combinator with td:nth-of-type again to get the LTP column value for that row.
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=-10006&symbol=NIFTY&symbol=NIFTY&instrument=-&date=-&segmentLink=17&symbolCount=2&segmentLink=17')
soup = bs(r.content, 'lxml')
ltp = soup.select_one('#octable tr:has(td:nth-of-type(12):contains("12000.00")) td:nth-of-type(6)').text.strip()
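An aside: nseindia.com is known to reject clients that do not look like a browser. If the request above comes back empty or with an error page, sending browser-like headers may help; the User-Agent value below is only illustrative:
# Illustrative only: a browser-like User-Agent for servers that block bare clients.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
r = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=-10006&symbol=NIFTY&symbol=NIFTY&instrument=-&date=-&segmentLink=17&symbolCount=2&segmentLink=17', headers=headers)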

import requests
from bs4 import BeautifulSoup

page = requests.get('https://nseindia.com/live_market/dynaContent/live_watch/option_chain/optionKeys.jsp?symbolCode=-10006&symbol=NIFTY&symbol=NIFTY&instrument=-&date=-&segmentLink=17&symbolCount=2&segmentLink=17')
soup = BeautifulSoup(page.content, "lxml")

# Collect every data row of the option chain (skip the two header rows and the totals row).
data = []
for tr in soup.select('table#octable tr')[2:-1]:
    data.append([td.text.strip() for td in tr.select('td')])

def get_ltp(data, strike_price):
    # Column 11 is the strike price; column 5 is the call-side LTP.
    for d in data:
        if strike_price == d[11]:
            return d[5]

print(get_ltp(data, '12000.00'))
Prints:
25.35

Related

Beautiful Soup select two items with parent-child relationship

The code below finds all the links with gameId in them and puts the links in a dataframe. My issue is that I'm not sure how to store them in the dataframe with the corresponding date. In this case the h2 is the parent tag, and the child tag has the links. The code below gets the links, but how do I get the date for each gameId?
import pandas as pd
import requests
from bs4 import BeautifulSoup

gmdf = pd.DataFrame(columns=['link','gamedate'])
url = 'https://www.espn.com/nfl/schedule/_/week/1/year/2020'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.select('a')
for link in links:
    if 'gameId' in link.get('href'):
        print(link.get('href'))
        hlink = 'https://www.espn.com' + link.get('href')
        gmdf = gmdf.append({'link': hlink}, ignore_index=True)
This line gets the dates on the page, but I need each date matched to the corresponding gameId in the data frame:
soup.select('h2')
Here is an alternative method to the one Dhivakar has already provided. I add the h2 tag to the original BeautifulSoup selection, then set the date whenever the element has no href: each selected element must be either an h2 or an a tag, and the h2 tags contain the dates.
import pandas as pd
import requests
from bs4 import BeautifulSoup

gmdf = pd.DataFrame(columns=['link','gamedate'])
url = 'https://www.espn.com/nfl/schedule/_/week/1/year/2020'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.select('a, h2')
date = ""
for link in links:
    if link.get('href') is None:
        date = link.text
        print(date)
    elif link.get('href') is not None and 'gameId' in link.get('href'):
        print(date)
        print(link.get('href'))
        hlink = 'https://www.espn.com' + link.get('href')
        gmdf = gmdf.append({'link': hlink, 'gamedate': date}, ignore_index=True)
print(gmdf)
You can simply use a nested loop within a list comprehension: loop over the date headers, find the next table after each one, and grab the hrefs containing the substring of interest (gameId). That way you have the relevant date listed against each link in a list of tuples, which you convert to a DataFrame:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

r = requests.get('https://www.espn.com/nfl/schedule/_/week/1/year/2020')
soup = bs(r.text, 'lxml')
df = pd.DataFrame([('https://www.espn.com' + j['href'], i.text)
                   for i in soup.select('#sched-container .table-caption')
                   for j in i.find_next('table').select('[href*=gameId]')],
                  columns=['link', 'date'])
print(df)
You can grab parents and siblings of elements just like in JavaScript. Replace everything after links = soup.select('a') with the following (note it also needs import datetime at the top of the script):
import datetime

schedule_year = soup.select_one('.automated-header h1').text.split("- ")[-1]  # the schedule year
for link in links:
    if 'gameId' in link.get('href'):
        # Walks up to the schedule table's container, then back to the preceding h2 that holds the date.
        schedule_date = link.parent.parent.parent.parent.parent.previous_sibling.text.split(", ")[-1] + " " + schedule_year
        schedule_date = datetime.datetime.strptime(schedule_date, "%B %d %Y")  # convert to a datetime object for manipulation
        hlink = 'https://www.espn.com' + link.get('href')
        gmdf = gmdf.append({'link': hlink, 'gamedate': schedule_date}, ignore_index=True)
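As a hedged aside, bs4's find_previous can stand in for that long .parent chain. Assuming each date h2 precedes its games in document order (which the layout above suggests), the chained line could instead read:
# Hypothetical replacement for the .parent chain: the nearest preceding h2 holds the date.
schedule_date = link.find_previous('h2').text.split(", ")[-1] + " " + schedule_year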

Creating a for loop in bs4 for eBay listings

I have the following code that goes to an eBay URL and extracts the name and sale price of a listing; see below:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://www.ebay.com/sch/i.html?_from=R40&_nkw=manga&_sacat=0&rt=nc&LH_Sold=1&LH_Complete=1')
soup = BeautifulSoup(html.read(), 'html.parser')
soldItem = soup.find('h3', class_='s-item__title s-item__title--has-tags')
salePrice = soup.find('span', class_='POSITIVE')
#data = soup.find('div', class_='s-item__info clearfix')
itemData = {soldItem.get_text(): salePrice.get_text()}
I want to create a for loop that iterates over the first page and gives me the name and sale price of every listing.
However, every attempt I've made returns either the same listing five times, or all sold items followed by all sale prices.
Any hints as to how to format my for loop?
You can try this code, which builds a dictionary of every item and its price, keyed by item name with the price as the value:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://www.ebay.com/sch/i.html?_from=R40&_nkw=manga&_sacat=0&rt=nc&LH_Sold=1&LH_Complete=1')
soup = BeautifulSoup(html.read(), 'html.parser')
# find_all returns every match on the page, not just the first.
soldItem = soup.find_all('h3', class_='s-item__title s-item__title--has-tags')
salePrice = soup.find_all('span', class_='POSITIVE')
itemsData = {item.text: price.text for item, price in zip(soldItem, salePrice)}
print(itemsData)
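One caveat: zip silently truncates to the shorter list, so if any listing lacks a price span the titles and prices can get misaligned. A per-listing loop avoids that. This is a minimal sketch assuming the s-item__info container class (seen in the commented-out line of the question) wraps each listing card:
itemsData = {}
# Walk one listing card at a time so each title stays paired with its own price.
for card in soup.find_all('div', class_='s-item__info'):
    title = card.find('h3', class_='s-item__title')
    price = card.find('span', class_='POSITIVE')
    if title and price:  # skip cards missing either field
        itemsData[title.get_text()] = price.get_text()
print(itemsData)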

How to specify table for BeautifulSoup to find?

I'm trying to grab the table on this page https://nces.ed.gov/collegenavigator/?id=139755 under the Net Price expandable object. I've gone through tutorials for BS4, but I get so confused by the complexity of the html in this case that I can't figure out what syntax and which tags to use.
Here's a screenshot of the table and html I'm trying to get:
This is what I have so far. How do I add other tags to narrow down the results to just that one table?
import requests
from bs4 import BeautifulSoup
page = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = BeautifulSoup(page.text, 'html.parser')
soup = soup.find(id="divctl00_cphCollegeNavBody_ucInstitutionMain_ctl02")
print(soup.prettify())
Once I can parse that data, I will format it into a dataframe with pandas.
In this case I'd probably just use pandas to retrieve all the tables, then index in for the appropriate one:
import pandas as pd
table = pd.read_html('https://nces.ed.gov/collegenavigator/?id=139755')[10]
print(table)
If you are worried about future ordering, you could loop the tables returned by read_html and test for the presence of a unique string to identify the right table, or use the bs4 :has and :contains functionality (bs4 4.7.1+) to identify the right table and then pass it to read_html, or continue handling it with bs4:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

r = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = bs(r.content, 'lxml')
# Select the table that contains the identifying string, then hand it to read_html.
table = pd.read_html(str(soup.select_one('table:has(td:contains("Average net price"))')))
print(table)
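And a minimal sketch of the first alternative: loop the tables read_html returns and keep the one containing a unique string ('Average net price' again):
import pandas as pd

tables = pd.read_html('https://nces.ed.gov/collegenavigator/?id=139755')
for t in tables:
    # Flatten every cell to text and test for the identifying string.
    if t.astype(str).stack().str.contains('Average net price').any():
        print(t)
        break
pandas can also do this filtering itself via read_html's match parameter, e.g. pd.read_html(url, match='Average net price').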
OK, maybe this can help you. I added pandas:
import requests
from bs4 import BeautifulSoup
import pandas as pd

page = requests.get('https://nces.ed.gov/collegenavigator/?id=139755')
soup = BeautifulSoup(page.text, 'html.parser')
div = soup.find("div", {"id": "divctl00_cphCollegeNavBody_ucInstitutionMain_ctl02"})
table = div.findAll("table", {"class": "tabular"})[1]
l = []
table_rows = table.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    if td:
        row = [i.text for i in td]
        l.append(row)
df = pd.DataFrame(l, columns=["AVERAGE NET PRICE BY INCOME","2015-2016","2016-2017","2017-2018"])
print(df)
Here is a basic script to scrape that first table in that accordion:
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://nces.ed.gov/collegenavigator/?id=139755#netprc"
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
parent_table = soup.find('div', attrs={'id':'netprc'})
desired_table = parent_table.find('table')
print(desired_table.prettify())
I assume you only want the values within the table, so I made an overkill version as well that combines the column names and values:
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = "https://nces.ed.gov/collegenavigator/?id=139755#netprc"
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
parent_table = soup.find('div', attrs={'id':'netprc'})
desired_table = parent_table.find('table')
header_row = desired_table.find_all('th')
headers = []
for header in header_row:
    header_text = header.get_text()
    headers.append(header_text)
money_values = []
data_row = desired_table.find_all('td')
for rows in data_row:
    row_text = rows.get_text()
    money_values.append(row_text)
for yrs, money in zip(headers, money_values):
    print(yrs, money)
This will print out the following:
Average net price
2015-2016 $13,340
2016-2017 $15,873
2017-2018 $16,950
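If you then want those pairs in a structure you can query, a small follow-up building on the headers and money_values lists above is:
# Zip the year headers and dollar values into a dict for direct lookup.
net_price = dict(zip(headers, money_values))
print(net_price['2017-2018'])  # $16,950, per the output above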

BS "find_all" method does not match all target

When I use the find_all method on this page, Beautiful Soup doesn't find all the targets.
This code:
len(mySoup.find_all('div', {'class': 'lo-liste row'}))
Returns 1, yet there are 4.
This is the soup URL: https://www.ubaldi.com/offres/jeu-de-3-disques-feutres-electrolux--ma-92ca565jeud-4b9yz--727536.php
When I looked at the source code of the given link, I found there is only one div with the class name "lo-liste row"; the other three divs have the class name "lo-liste row not-first-ligne", so that's why you got only 1 as output.
Try the following code:
len(soup.findAll('div', {'class': ['lo-liste row','not-first-ligne']}))
from bs4 import BeautifulSoup
import requests
page = requests.get("https://www.ubaldi.com/offres/jeu-de-3-disques-feutres-electrolux--ma-92ca565jeud-4b9yz--727536.php")
soup = BeautifulSoup(page.content, 'html.parser')
print(len(soup.findAll('div', {'class': ['lo-liste row','not-first-ligne']})))
The find_all DOES correctly match all targets.
The first product has class=lo-liste row
The next 3 products have class=lo-liste row not-first-ligne
import requests
from bs4 import BeautifulSoup

url = 'https://www.ubaldi.com/offres/jeu-de-3-disques-feutres-electrolux--ma-92ca565jeud-4b9yz--727536.php'
response = requests.get(url)
mySoup = BeautifulSoup(response.text, 'html.parser')
for product in mySoup.find_all('div', {'class': 'lo-liste row'}):
    print(product.find('a').find_next('span').text.strip())
for product in mySoup.find_all('div', {'class': 'lo-liste row not-first-ligne'}):
    print(product.find('a').find_next('span').text.strip())
# or to combine those 2 for loops into 1
#for product in mySoup.findAll('div', {'class': ['lo-liste row','not-first-ligne']}):
#    print(product.find('a').find_next('span').text.strip())
Output:
SOS Accessoire
Stortle
Groupe-Dragon
Asdiscount
Use select instead. CSS class selectors match on individual class tokens, so it will match all 4 for that class.
items = soup.select('.lo-liste.row')
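For example, reusing mySoup from the answer above, a quick check (a minimal sketch) shows all four rows are matched:
items = mySoup.select('.lo-liste.row')
# Token-based matching catches both 'lo-liste row' and 'lo-liste row not-first-ligne'.
print(len(items))  # 4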
Use a regular expression via re to find the elements:
from bs4 import BeautifulSoup
import requests
import re
url = 'https://www.ubaldi.com/offres/jeu-de-3-disques-feutres-electrolux--ma-92ca565jeud-4b9yz--727536.php'
html= requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')
print(len(soup.find_all('div', class_=re.compile('lo-liste row'))))
Output:
4

Beautiful Soup scrape table with table breaks

I'm trying to scrape a table into a dataframe. My attempt only returns the table name and not the data within rows for each region.
This is what I have so far:
from bs4 import BeautifulSoup as bs4
import requests

url = 'https://www.eia.gov/todayinenergy/prices.php'
r = requests.get(url)
soup = bs4(r.text, "html.parser")
table_regions = soup.find('table', {'class': "t4"})
regions = table_regions.find_all('tr')
for row in regions:
    print(row)
The ideal outcome I'd like to get:
region | price
---------------|-------
new england | 2.59
new york city | 2.52
Thanks for any assistance.
If you check your HTML response (soup), you will see that the table tag you get in the line table_regions = soup.find('table', {'class': "t4"}) is closed before the rows that contain the information you need (the ones containing td's with the class names up, dn, d1 and s1).
So how about using the raw tr/td tags like this:
from bs4 import BeautifulSoup as bs4
import requests
import pandas as pd

url = 'https://www.eia.gov/todayinenergy/prices.php'
r = requests.get(url)
soup = bs4(r.text, "html.parser")
a = soup.find_all('tr')
rows = []
subel = []
for tr in a[42:50]:
    b = tr.find_all('td')
    for td in b:
        subel.append(td.string)
    rows.append(subel)
    subel = []
df = pd.DataFrame(rows, columns=['Region','Price_1', 'Percent_change_1', 'Price_2', 'Percent_change_2', 'Spark Spread'])
Notice that I use just the a[42:50] slice of the results, because a contains every tr element on the page. You can use the rest too if you need to.
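If you only need the two-column layout from the question, a minimal follow-up sketch reusing the df and column names above would be:
# Reduce to region | price, dropping the extra price and percent-change columns.
out = df[['Region', 'Price_1']].rename(columns={'Price_1': 'price'})
print(out)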
