How to extract table data from a website only AFTER inputting data? - python

There is a website that doesn't accept queries directly (the form is hidden). It has an input field with an HTML id; once you enter a value and click submit, you get a single-row table.
Is it possible to enter input values in a loop and get the table data by web scraping with Python along with BeautifulSoup or Flask? (Not Selenium.)
Link: https://www.pesuacademy.com/Academy (click on "Know your Class & Section").
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

# Set the URL you want to web-scrape from
url = 'https://www.pesuacademy.com/Academy'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# results = soup.find(id="knowClsSectionModalLoginId")
# R = soup.find(id="knowClsSectionModalTableDate")
try:
    # grab the value attribute of the login input field
    a = soup.find('input', {'id': 'knowClsSectionModalLoginId'}).get('value')
    for i in a:
        print(i)
except AttributeError:
    pass

I assume you are referring to "Know your Class & Section". This is a form.
Submitting it triggers an AJAX POST call with the login ID.
You can put all the IDs in the list loginids; the script below loops through them, collects the data, and saves it to a CSV file.
import requests
from bs4 import BeautifulSoup
import pandas as pd

loginids = ["PES1201900004"]

payload = {
    "loginId": ""
}
headers = {
    "content-type": "application/x-www-form-urlencoded"
}
url = "https://pesuacademy.com/Academy/getStudentClassInfo"
columns = ['PRN', 'SRN', 'Name', 'Class', 'Section', 'Cycle', 'Department', 'Branch', 'Institute Name']

data = []
for logins in loginids:
    payload["loginId"] = logins
    res = requests.post(url, data=payload, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    data.append([i.get_text(strip=True) for i in soup.find("table").find("tbody").find_all("td")])

df = pd.DataFrame(data, columns=columns)
df.to_csv("data.csv", index=False)
print(df)
Output:
PRN SRN Name Class Section Cycle Department Branch Institute Name
0 PES1201900004 NA AKSHAYA RAMESH NA B ARCH

Related

If duplicate URL entry exists don't append the data (BeautifulSoup to Google Sheets)

I was wondering if you can help.
I'm using beautifulsoup to write to Google Sheets.
I've created a crawler that runs through a series of URLs, scrapes the content and then updates a Google sheet.
What I now want to do is prevent a URL from being written to my sheet again if it already exists in column C.
e.g. if I already had the URL https://www.bbc.co.uk/1 in my table, I wouldn't want it appearing again.
Here is my code:
from cgitb import text
import requests
from bs4 import BeautifulSoup
import gspread
import datetime
import urllib.parse

gc = gspread.service_account(filename='creds.json')
sh = gc.open('scrapetosheets').sheet1

urls = ["https://www.ig.com/uk/trading-strategies", "https://www.ig.com/us/trading-strategies"]

for url in urls:
    my_url = requests.get(url)
    html = my_url.content
    soup = BeautifulSoup(html, 'html.parser')

    for item in soup.find_all('h3', class_="article-category-section-title"):
        date = datetime.datetime.now()
        title = item.find('a', class_='primary js_target').text.strip()
        url = item.find('a', class_='primary js_target').get('href')
        abs = "https://www.ig.com"
        rel = url
        info = {'date': date, 'title': title, 'url': urllib.parse.urljoin(abs, rel)}
        sh.append_row([str(info['date']), str(info['title']), str(info['url'])])
Thanks in advance.
Mark
I'd like to know what I can add to the end of my code to prevent duplicate URLs being entered into my Google Sheet.
I believe your goal is as follows.
You want to append the row [str(info['date']), str(info['title']), str(info['url'])] only when the value of str(info['url']) does not already exist in column "C".
Modification points:
In this case, you need to check column "C" of the existing sheet sh = gc.open('scrapetosheets').sheet1. This has already been mentioned in TheMaster's comment.
In your script, append_row is called inside a loop, so each row is a separate API request and the process cost becomes high.
When these points are reflected in your script, how about the following modification?
Modified script:
from cgitb import text
import requests
from bs4 import BeautifulSoup
import gspread
import datetime
import urllib.parse

gc = gspread.service_account(filename='creds.json')
sh = gc.open('scrapetosheets').sheet1

urls = ["https://www.ig.com/uk/trading-strategies", "https://www.ig.com/us/trading-strategies"]

# I modified the below script.
obj = {r[2]: True for r in sh.get_all_values()}
ar = []
for url in urls:
    my_url = requests.get(url)
    html = my_url.content
    soup = BeautifulSoup(html, "html.parser")

    for item in soup.find_all("h3", class_="article-category-section-title"):
        date = datetime.datetime.now()
        title = item.find("a", class_="primary js_target").text.strip()
        url = item.find("a", class_="primary js_target").get("href")
        abs = "https://www.ig.com"
        rel = url
        info = {"date": date, "title": title, "url": urllib.parse.urljoin(abs, rel)}
        url = str(info["url"])
        if url not in obj:
            ar.append([str(info["date"]), str(info["title"]), url])

if ar != []:
    sh.append_rows(ar, value_input_option="USER_ENTERED")
When this script is run, the existing values are first retrieved from the sheet and used to build an object for looking up str(info["url"]). When str(info["url"]) does not already exist in column "C" of the sheet, the row values are put into an array, and the whole array is then appended to the sheet in a single append_rows call.
Reference:
append_rows

Beautiful Soup not returning data within a table

I want to retrieve a financial dataset from a website which has a login. I've managed to log in using requests and access the HTML:
import requests
from bs4 import BeautifulSoup
import pandas as pd

s = requests.session()
login_data = dict(email='my login', password='password')
s.post('*portal website with/login*', data=login_data)

r = s.get('*website with financial page*')
print(r.content)

# work on r as it is a direct link to the finance page
page = r  # the stock page response
soup = BeautifulSoup(page.text, 'html.parser')  # parses the HTML of the finance page
The above code allows me to log in and get the html from the correct page.
# find the table and collect all the column headers
table = soup.find('table')
headers = []
for i in table.find_all('th'):
    title = i.text.strip()
    headers.append(title)

df = pd.DataFrame(columns=headers)
print(df)
This block finds the table and gets the column headers, which are printed as:
Columns: [Date, Type, Type, Credit, Debit, Outstanding, Case File, ]
The next part is the problem: when I attempt to retrieve the financials using the following code:
for row in table.find_all('tr')[1:]:
    data = row.find_all('td')
    row_data = [td.text.strip() for td in data]
    print(row_data)
it returns this
['"Loading Please Wait..."']
The HTML of the site looks like this (screenshot of the page source omitted).
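The "Loading Please Wait..." text is the placeholder the page shows before JavaScript fills the table in, so the logged-in requests session never receives the real rows. As in the other answers on this page, the usual workaround is to call the URL the page itself requests for the table data. A minimal sketch under that assumption; the endpoint path and parameter below are hypothetical and would have to be copied from the browser's network tab:
import requests
from bs4 import BeautifulSoup

s = requests.session()
s.post('*portal website with/login*', data=dict(email='my login', password='password'))

# Hypothetical endpoint: watch the Network tab while the table loads and copy
# the request the page makes; this path and parameter are placeholders only.
resp = s.get('*website with financial page*/table-data', params={'page': 1})

soup = BeautifulSoup(resp.text, 'html.parser')
rows = [[td.get_text(strip=True) for td in tr.find_all('td')]
        for tr in soup.find_all('tr')[1:]]
print(rows)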

Attempting to export parsed data to CSV file with Python and I can't figure out how to export more than one row

I'm fairly new to Beautiful Soup/Python/web scraping. I have been able to scrape data from a site, but I am only able to export the very first row to a CSV file (I want to export all scraped data into the file).
I am stumped on how to make this code export ALL scraped data into multiple individual rows:
r = requests.get("https://www.infoplease.com/primary-sources/government/presidential-speeches/state-union-addresses")
data = r.content  # Content of response
soup = BeautifulSoup(data, "html.parser")

for span in soup.find_all("span", {"class": "article"}):
    for link in span.select("a"):
        name_and_date = link.text.split('(')
        name = name_and_date[0].strip()
        date = name_and_date[1].replace(')', '').strip()
        base_url = "https://www.infoplease.com"
        links = link['href']
        links = urljoin(base_url, links)
        pres_data = {'Name': [name],
                     'Date': [date],
                     'Link': [links]
                     }

df = pd.DataFrame(pres_data, columns=['Name', 'Date', 'Link'])
df.to_csv(r'C:\Users\ThinkPad\Documents\data_file.csv', index=False, header=True)
print(df)
Any ideas here? I believe I need to loop it through the data parsing and grab each set and push it in.
Am I going about this the right way?
Thanks for any insight
The way it is currently set up, you are not adding each link as a new entry; only the last link ends up in the DataFrame. If you initialize a list and append a dictionary for each iteration of the inner "links" for loop, you will add every row and not just the last one.
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

r = requests.get("https://www.infoplease.com/primary-sources/government/presidential-speeches/state-union-addresses")
data = r.content  # Content of response
soup = BeautifulSoup(data, "html.parser")

pres_data = []
for span in soup.find_all("span", {"class": "article"}):
    for link in span.select("a"):
        name_and_date = link.text.split('(')
        name = name_and_date[0].strip()
        date = name_and_date[1].replace(')', '').strip()
        base_url = "https://www.infoplease.com"
        links = link['href']
        links = urljoin(base_url, links)
        this_data = {'Name': name,
                     'Date': date,
                     'Link': links
                     }
        pres_data.append(this_data)

df = pd.DataFrame(pres_data, columns=['Name', 'Date', 'Link'])
df.to_csv(r'C:\Users\ThinkPad\Documents\data_file.csv', index=False, header=True)
print(df)
You don't need to use pandas here since you are not applying any data operations.
For short tasks like this, try to limit yourself to the built-in libraries.
import requests
from bs4 import BeautifulSoup
import csv

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    target = [([x.a['href']] + x.a.text[:-1].split(' ('))
              for x in soup.select('span.article')]
    with open('data.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Url', 'Name', 'Date'])
        writer.writerows(target)

main('https://www.infoplease.com/primary-sources/government/presidential-speeches/state-union-addresses')
Sample of output: (screenshot omitted)

I am trying to scrape multiple tables from 30 similar links using Python

I have 10 links to company pages:
https://www.zaubacorp.com/company/ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757,
https://www.zaubacorp.com/company/METTLE-PUBLICATIONS-PRIVATE-LIMITED/U22120MH2019PTC329729,
https://www.zaubacorp.com/company/PRINTSCAPE-INDIA-PRIVATE-LIMITED/U22120MH2020PTC335354,
https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665,
https://www.zaubacorp.com/company/BHOOKA-NANGA-FILMS-PRIVATE-LIMITED/U22130DL2019PTC353194,
https://www.zaubacorp.com/company/WHITE-CAMERA-SCHOOL-OF-PHOTOGRAPHY-PRIVATE-LIMITED/U22130JH2019PTC013311,
https://www.zaubacorp.com/company/RLE-PRODUCTIONS-PRIVATE-LIMITED/U22130KL2019PTC059208,
https://www.zaubacorp.com/company/CATALIZADOR-MEDIA-PRIVATE-LIMITED/U22130KL2019PTC059793,
https://www.zaubacorp.com/company/TRIPPLED-MEDIAWORKS-OPC-PRIVATE-LIMITED/U22130MH2019OPC333171,
https://www.zaubacorp.com/company/KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391
Now I am trying to scrape tables from these links and save the data to a well-formatted CSV file. I want to scrape the "Company Details", "Share Capital & Number of Employees", "Listing and Annual Compliance Details", "Contact Details", and "Director Details" tables. If a table has no data or a column is missing, I want that column to be blank in the output CSV file. I have written some code but can't get the output; I am doing something wrong here. Please help.
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import csv
import lxml

url_file = "Zaubalinks.txt"
with open(url_file, "r") as url:
    url_pages = url.read()

# we need to split each urls into lists to make it iterable
pages = url_pages.split("\n")  # Split by lines using \n

# now we run a for loop to visit the urls one by one
data = []
for single_page in pages:
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')

    table = soup.find_all('table')  # finds all tables
    table_top = pd.read_html(str(table))[0]  # the top table
    try:  # try to get the other table if exists
        table_capital = pd.read_html(str(table))[5]
        table_listing = pd.read_html(str(table))[6]
        table_contact = pd.read_html(str(table))[7]
        table_director = pd.read_html(str(table))[8]
    except:
        table_capital = pd.DataFrame()
        table_listing = pd.DataFrame()
        table_contact = pd.DataFrame()
        table_director = pd.DataFrame()

    result = pd.concat([table_top, table_capital, table_listing, table_contact, table_director])
    data.append(result)

print(data)
pd.concat(data).to_csv('ZaubaAll.csv')
import requests
from bs4 import BeautifulSoup
import pandas as pd

companies = {
    'ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757',
    'METTLE-PUBLICATIONS-PRIVATE-LIMITED/U22120MH2019PTC329729',
    'PRINTSCAPE-INDIA-PRIVATE-LIMITED/U22120MH2020PTC335354',
    'CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665',
    'BHOOKA-NANGA-FILMS-PRIVATE-LIMITED/U22130DL2019PTC353194',
    'WHITE-CAMERA-SCHOOL-OF-PHOTOGRAPHY-PRIVATE-LIMITED/U22130JH2019PTC013311',
    'RLE-PRODUCTIONS-PRIVATE-LIMITED/U22130KL2019PTC059208',
    'CATALIZADOR-MEDIA-PRIVATE-LIMITED/U22130KL2019PTC059793',
    'TRIPPLED-MEDIAWORKS-OPC-PRIVATE-LIMITED/U22130MH2019OPC333171',
    'KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391'
}

def main(url):
    with requests.Session() as req:
        goal = []
        for company in companies:
            r = req.get(url.format(company))
            df = pd.read_html(r.content)
            target = pd.concat([df[x].T for x in [0, 3, 4]], axis=1)
            goal.append(target)
        new = pd.concat(goal)
        new.to_csv("data.csv")

main("https://www.zaubacorp.com/company/{}")
Fortunately, it seems you can get there with simpler methods. Taking one random link as an example, it should be something like:
import pandas as pd

url = 'https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665'
tables = pd.read_html(url)
From here, your tables are in tables[0], tables[3], tables[4], tables[15], etc. Just use a for loop to rotate through all the urls.
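A minimal sketch of that loop, assuming the table positions seen above hold for every company page (the output filename and the guard against pages with fewer tables are assumptions):
import pandas as pd

urls = [
    'https://www.zaubacorp.com/company/ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757',
    'https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665',
    # ... the remaining company links
]

frames = []
for url in urls:
    tables = pd.read_html(url)  # one DataFrame per <table> on the page
    # keep only the tables of interest; indices assumed from the example above
    wanted = [tables[i] for i in (0, 3, 4, 15) if i < len(tables)]
    frames.append(pd.concat(wanted))

pd.concat(frames).to_csv('zauba_tables.csv', index=False)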

Empty Dataframe when scraping specific column from website

I wanted to scrape some specific columns (the Company Details column) on the CNBC Nasdaq 100 website, specifically for the Adobe stock. Below is a snippet of my code:
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

def get_company_info(url):
    original_url = url
    key = {}
    l = []
    page_response = requests.get(url, timeout=240)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    name = page_content.find('div', {"class": "quote-section-header large-header"}).find("span", {"class": "symbol"}).text
    description = page_content.find_all('div', {"class": "moduleBox"})
    for items in description:
        for i in range(len(items.find_all("tr")) - 1):
            # Gather data
            key["stock_desc"] = items.find_all("td", {"class": "desc"})[i].find('div', attrs={'id': 'descLong'}).text
            shares = items.find_all("td").find("table", attrs={"id": "shares"})
            for rest_of_items in shares:
                for i in range(len(items.find_all("tr")) - 1):
                    key["stock_outstanding-shares"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_ownership"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_market_cap"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_lastSplit"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    # Print ("")
                    l.append(key)
                    key['name'] = name
    df = pd.DataFrame(l)
    print(df)
    return key, df

get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
So, I'm keen to get the result in a DataFrame so that I can convert it to a CSV file, but my code keeps showing an empty DataFrame. Below is the error shown (screenshot omitted).
The result I wanted is something like this (screenshot omitted).
The information you are looking for is not available at the URL you requested, because the page fetches it with JavaScript, which in turn requests a different URL that provides the data.
Example code
from bs4 import BeautifulSoup
import requests
page=requests.get("https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary")
soup = BeautifulSoup(page.content, 'html.parser')
Name=soup.find("h5",id="companyName").text
stock_desc= soup.find("div",id="descLong").text
table=soup.find("table",id="shares")
details=table.find_all("td", class_="bold aRit")
stock_outstanding_shares= details[0].text
stock_ownership= details[1].text
stock_market_cap= details[2].text
stock_lastSplit= details[3].text
You can then create a DataFrame and export it to CSV.
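A short sketch of that last step, continuing from the example code above; the column names and output filename are assumptions:
import pandas as pd

# collect the scraped fields into a one-row DataFrame and write it to CSV
df = pd.DataFrame([{
    "Name": Name,
    "Description": stock_desc,
    "Outstanding Shares": stock_outstanding_shares,
    "Ownership": stock_ownership,
    "Market Cap": stock_market_cap,
    "Last Split": stock_lastSplit,
}])
df.to_csv("adbe_profile.csv", index=False)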
