I want to retrieve a financial dataset from a website that requires a login. I've managed to log in using requests and access the HTML:
import requests
from bs4 import BeautifulSoup
import pandas as pd

s = requests.session()
login_data = dict(email='my login', password='password')
s.post('*portal website with /login*', data=login_data)
r = s.get(' *website with financial page* ')
print(r.content)

## work on r as it's a direct link
page = r  # response for the stock page
soup = BeautifulSoup(page.text, 'html.parser')  # parses the HTML of the finance page
The above code allows me to log in and get the HTML from the correct page.
table = soup.find('table')  # assumes the financial data sits in the first/only table on the page
headers = []
# finds all the headers.
for i in table.find_all('th'):
    title = i.text.strip()
    headers.append(title)

df = pd.DataFrame(columns=headers)
print(df)
This block finds the table and gets the column headers, which are printed as:
Columns: [Date, Type, Type, Credit, Debit, Outstanding, Case File, ]
The next part is the problem. When I attempt to retrieve the financials using the following code:
for row in table.find_all('tr')[1:]:
    data = row.find_all('td')
    row_data = [td.text.strip() for td in data]
    print(row_data)
it returns this:
['"Loading Please Wait..."']
The HTML of the site looks like this: [screenshot of the page HTML omitted]
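The "Loading Please Wait..." text is a strong hint that the table rows are filled in by JavaScript after the page loads, so they never appear in the HTML that requests receives. The usual workaround, shown in the related answer below, is to find the background request in the browser's network tab and call it directly with the logged-in session. A minimal sketch of that idea, where the endpoint path and the response format are placeholders rather than the portal's real API:

# Hypothetical endpoint spotted in the browser's network tab -- replace with the real one.
api_url = '*portal website*/api/financials'
resp = s.get(api_url)      # reuse the logged-in session so the auth cookies are sent
records = resp.json()      # assuming the endpoint returns JSON rows
df = pd.DataFrame(records)
print(df)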
Related
I would like to scrape all the data-oid tags from this page, but nothing is returned in the output.
Code
import requests
from bs4 import BeautifulSoup

url = 'https://www.betexplorer.com/soccer/south-korea/k-league-2/bucheon-fc-1995-jeonnam/EDwej14E/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', class_='table-main')
for rows in table.find_all('tr')[1:]:
    for row in rows.find_all('td'):
        data = row.get('data-oid')
        print(data)
The table part of the page is loaded from an external URL via JavaScript. To get the data along with the tags that carry data-oid= attributes, you can use this example:
import requests
from bs4 import BeautifulSoup

url = "https://www.betexplorer.com/soccer/south-korea/k-league-2/bucheon-fc-1995-jeonnam/EDwej14E/"
match_id = "EDwej14E"  # <-- this is the last part of the URL
api_url = "https://www.betexplorer.com/match-odds/{}/1/1x2/".format(match_id)
headers = {"Referer": "https://www.betexplorer.com"}

data = requests.get(api_url, headers=headers).json()
soup = BeautifulSoup(data["odds"], "html.parser")

# your code:
table = soup.find("table", class_="table-main")
for rows in table.find_all("tr")[1:]:
    for row in rows.select("td[data-oid]"):
        data = row["data-oid"]
        print(data)
Prints:
...
4kqjpxv464x0xc6aif
4kqjpxv464x0xc6aie
4kqjpxv498x0x0
4kqjpxv464x0xc6aif
4kqjpxv464x0xc6aie
4kqjpxv498x0x0
4kqjpxv464x0xc6aif
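If the odds figures themselves are what you're after rather than the data-oid identifiers, the same HTML fragment can also be handed to pandas. A small sketch, assuming pandas plus a read_html backend (lxml or html5lib) is installed; recent pandas versions prefer a file-like object over a raw HTML string:

import io
import pandas as pd

# table is the <table class="table-main"> element parsed above
odds_df = pd.read_html(io.StringIO(str(table)))[0]
print(odds_df)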
I am mostly new. I have a go-to formula for scraping sports data, but I am unable to get it to work with Yahoo. I am able to get my tokens, but then using BeautifulSoup to scrape returns nothing. Here is the working part of my script:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

# URL to be scraped
url = "https://basketball.fantasysports.yahoo.com/nba/75810/1"

# this is the HTML from the given URL
html = urlopen(url)
soup = BeautifulSoup(html, "html.parser")

# use findAll() and getText() to extract the column headers into a list
headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]

# avoid the first header row
rows = soup.findAll('tr')[1:]
team_stats_advanced = [[td.getText() for td in rows[i].findAll('td')]
                       for i in range(len(rows))]

# create data frames
advancedstats = pd.DataFrame(team_stats_advanced[1:], columns=headers)
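One possible explanation, offered only as a guess since the token-handling part of the script isn't shown: urlopen(url) opens a fresh, unauthenticated connection, so the HTML it returns may be a login page rather than the league page, leaving the table selectors with nothing to match. A minimal sketch of reusing an authenticated requests.Session instead (the login step here is a placeholder, not Yahoo's actual flow):

import requests
from bs4 import BeautifulSoup

session = requests.Session()
# ...log in here so the session holds the auth cookies / tokens (placeholder)...

resp = session.get("https://basketball.fantasysports.yahoo.com/nba/75810/1")
soup = BeautifulSoup(resp.text, "html.parser")

# Quick sanity check: this should be a stats header row, not a login form
print(soup.find('tr'))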
There is a website that doesn't take queries (hidden). There is an input field with an HTML id; once you enter a value and click submit, you get a single-row table.
Is it possible to enter input values in a loop and get the table data by web scraping with Python and BeautifulSoup or Flask? (Not Selenium.)
link
Click on "Know your class & section".
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

# Set the URL you want to webscrape from
url = 'https://www.pesuacademy.com/Academy'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

#results = soup.find(id="knowClsSectionModalLoginId")
#R = soup.find(id="knowClsSectionModalTableDate")
try:
    a = soup.find('input', {'id': 'knowClsSectionModalLoginId'}).get('value')
    print(a)
except AttributeError:
    pass
I assume you are referring to "Know your Class & Section". This is a form.
This is an AJAX POST call with the loginId.
You can put all the ids in the list loginids. The script loops through them, gets all the data, and saves it to a csv file.
import requests
from bs4 import BeautifulSoup
import pandas as pd

loginids = ["PES1201900004"]
payload = {
    "loginId": ""
}
headers = {
    "content-type": "application/x-www-form-urlencoded"
}
url = "https://pesuacademy.com/Academy/getStudentClassInfo"
columns = ['PRN', 'SRN', 'Name', 'Class', 'Section', 'Cycle', 'Department', 'Branch', 'Institute Name']

data = []
for logins in loginids:
    payload["loginId"] = logins
    res = requests.post(url, data=payload, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    data.append([i.get_text(strip=True) for i in soup.find("table").find("tbody").find_all("td")])

df = pd.DataFrame(data, columns=columns)
df.to_csv("data.csv", index=False)
print(df)
Output:
PRN SRN Name Class Section Cycle Department Branch Institute Name
0 PES1201900004 NA AKSHAYA RAMESH NA B ARCH
I wanted to try to scrape some specific columns (the Company details column) on the CNBC Nasdaq 100 website, specifically the Adobe stock. Below is a snippet of my code:
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

def get_company_info(url):
    original_url = url
    key = {}
    l = []
    page_response = requests.get(url, timeout=240)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    name = page_content.find('div', {"class": "quote-section-header large-header"}).find("span", {"class": "symbol"}).text
    description = page_content.find_all('div', {"class": "moduleBox"})
    for items in description:
        for i in range(len(items.find_all("tr")) - 1):
            # Gather data
            key["stock_desc"] = items.find_all("td", {"class": "desc"})[i].find('div', attrs={'id': 'descLong'}).text
            shares = items.find_all("td").find("table", attrs={"id": "shares"})
            for rest_of_items in shares:
                for i in range(len(items.find_all("tr")) - 1):
                    key["stock_outstanding-shares"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_ownership"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_market_cap"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_lastSplit"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    # Print ("")
            l.append(key)
    key['name'] = name
    df = pd.DataFrame(l)
    print(df)
    return key, df

get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
So, I'm keen to get the result in a dataframe so that I can convert it to a CSV file, but my code keeps returning an empty dataframe. [Screenshot of the error omitted.]
The result I wanted is something like this: [screenshot of the desired table omitted]
The information you are looking for is not available in the URL you requested. This is because the information is fetched by the page using JavaScript, which in turn requests a different URL that provides the data.
Example code
from bs4 import BeautifulSoup
import requests

page = requests.get("https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary")
soup = BeautifulSoup(page.content, 'html.parser')

Name = soup.find("h5", id="companyName").text
stock_desc = soup.find("div", id="descLong").text

table = soup.find("table", id="shares")
details = table.find_all("td", class_="bold aRit")
stock_outstanding_shares = details[0].text
stock_ownership = details[1].text
stock_market_cap = details[2].text
stock_lastSplit = details[3].text
You can then create a dataframe and export it to CSV.
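For example, a minimal sketch that collects the values scraped above into a one-row dataframe and writes it out (the column names and the output filename are just illustrative):

import pandas as pd

# Build a one-row dataframe from the values scraped above
df = pd.DataFrame([{
    "name": Name,
    "stock_desc": stock_desc,
    "stock_outstanding_shares": stock_outstanding_shares,
    "stock_ownership": stock_ownership,
    "stock_market_cap": stock_market_cap,
    "stock_lastSplit": stock_lastSplit,
}])
df.to_csv("adbe_profile.csv", index=False)  # illustrative filename
print(df)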
I've written a script in Python to scrape the tabular content from a webpage. In the first column of the main table there are names. Some names link to another page; some are just names without any link. My intention is to parse the rows when a name has no link to another page. However, when the name does link to another page, the script should first parse the relevant rows from the main table and then follow that link to parse the associated information for that name from the table located at the bottom under the title Companies. Finally, it should write them to a csv file.
site link
I've tried so far:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

link = "https://suite.endole.co.uk/insight/company/ajax_people.php?ajax_url=ajax_people&page=1&company_number=03512889"
base = "https://suite.endole.co.uk"

res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.select("table tr")[1:]:
    if not item.select_one("td a[href]"):
        first_table = [i.text for i in item.select("td")]
        print(first_table)
    else:
        first_table = [i.text for i in item.select("td")]
        print(first_table)
        url = urljoin(base, item.select_one("td a[href]").get("href"))
        resp = requests.get(url)
        soup_ano = BeautifulSoup(resp.text, "lxml")
        for elems in soup_ano.select(".content:contains(Companies) table tr")[1:]:
            associated_info = [elem.text for elem in elems.select("td")]
            print(associated_info)
My above script can do almost everything, but I can't come up with the logic to collect the data in one place (rather than printing in three places) so that I can write it all to a csv file.
Put all your scraped data into a list (here I've called the list associated_info) so that all the data is in one place; you can then iterate over the list to write it out to a CSV if you like...
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

link = "https://suite.endole.co.uk/insight/company/ajax_people.php?ajax_url=ajax_people&page=1&company_number=03512889"
base = "https://suite.endole.co.uk"

res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")

associated_info = []
for item in soup.select("table tr")[1:]:
    if not item.select_one("td a[href]"):
        associated_info.append([i.text for i in item.select("td")])
    else:
        associated_info.append([i.text for i in item.select("td")])
        url = urljoin(base, item.select_one("td a[href]").get("href"))
        resp = requests.get(url)
        soup_ano = BeautifulSoup(resp.text, "lxml")
        for elems in soup_ano.select(".content:contains(Companies) table tr")[1:]:
            associated_info.append([elem.text for elem in elems.select("td")])

print(associated_info)
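Since each entry in associated_info is already a list of cell texts, writing the CSV afterwards is straightforward with the standard csv module; a short sketch (the output filename is just illustrative):

import csv

with open("endole_people.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(associated_info)  # one CSV row per scraped list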