I have scraped some updated day-by-day data (only numbers) and I want to show it in a proper table (a DataFrame). I don't know how to use Pandas. I am using Python, and the end result should look like a table with defined keys. Thanks
And here is my Python code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.worldometers.info/coronavirus/country/Austria/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

# RECOVERED, DEATHS AND TOTAL CASES
Covid_Cases_Array = []
get_Covid_Cases = soup.find_all(class_='maincounter-number')
for item in get_Covid_Cases:
    Covid_Cases_Array.append(item.text)
    print(item.text)

# ACTIVE AND CLOSED DATA
Covid_Active_Closed = []
get_Activ_Closed = soup.find_all(class_='number-table-main')
for item in get_Activ_Closed:
    Covid_Active_Closed.append(item.text)
    print(item.text)
And the result of that code:
600,089
9,997
563,256
26,836
573,253
You can use this example to get the data from the page into a DataFrame:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.worldometers.info/coronavirus/country/Austria/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

cases, deaths, recovered = soup.select(".maincounter-number")
active_cases, closed_cases = soup.select(".number-table-main")
active_cases_mild, active_cases_serious, _, _ = soup.select(".number-table")

df = pd.DataFrame(
    {
        "Coronavirus Cases": [cases.get_text(strip=True)],
        "Deaths": [deaths.get_text(strip=True)],
        "Recovered": [recovered.get_text(strip=True)],
        "Currently infected": [active_cases.get_text(strip=True)],
        "Closed cases": [closed_cases.get_text(strip=True)],
        "Active cases (mild)": [active_cases_mild.get_text(strip=True)],
        "Active cases (serious)": [active_cases_serious.get_text(strip=True)],
    }
)
print(df)
Prints:
Coronavirus Cases Deaths Recovered Currently infected Closed cases Active cases (mild) Active cases (serious)
0 600,089 9,997 563,256 26,836 573,253 26,279 557
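If you then want to work with the values as numbers rather than comma-formatted strings, a possible follow-up (a minimal sketch, assuming the df built above) is to strip the commas and cast:
# Turn "600,089"-style strings into plain integers, column by column
for col in df.columns:
    df[col] = df[col].str.replace(",", "", regex=False).astype(int)

print(df.dtypes)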
Related
I have been doing my web scraping project with this web link. The code runs with no errors, but it's not showing any records. Can you please check the reason for this?
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://live-cosmos.finq.com/trading-platform/#trading/Shares/Global/USA/All/FACEBOOK"
data = requests.get(url).text
soup = BeautifulSoup(data, 'html5lib')

df = pd.DataFrame(columns=["Instrument", "Sell", "Buy", "Change"])

for row in soup.find_all('tr'):
    col = row.find_all("td")
    Instrument = col[0].text
    Sell = col[1].text
    Buy = col[2].text
    Change = col[3].text
    df = df.append({"Instrument": Instrument, "Sell": Sell, "Buy": Buy, "Change": Change}, ignore_index=True)

print(df)
Thanks
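One way to check the reason: everything after the # in that URL is a client-side fragment that is never sent to the server, so the trading rows are almost certainly rendered by JavaScript after the page loads, and requests never receives any <tr> elements. A minimal diagnostic sketch:
import requests
from bs4 import BeautifulSoup

url = "https://live-cosmos.finq.com/trading-platform/#trading/Shares/Global/USA/All/FACEBOOK"
soup = BeautifulSoup(requests.get(url).text, "html5lib")

# If this prints 0, the rows are built by JavaScript after page load,
# so requests + BeautifulSoup alone will never see them.
print(len(soup.find_all("tr")))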
I have links to 10 companies.
https://www.zaubacorp.com/company/ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757,
https://www.zaubacorp.com/company/METTLE-PUBLICATIONS-PRIVATE-LIMITED/U22120MH2019PTC329729,
https://www.zaubacorp.com/company/PRINTSCAPE-INDIA-PRIVATE-LIMITED/U22120MH2020PTC335354,
https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665,
https://www.zaubacorp.com/company/BHOOKA-NANGA-FILMS-PRIVATE-LIMITED/U22130DL2019PTC353194,
https://www.zaubacorp.com/company/WHITE-CAMERA-SCHOOL-OF-PHOTOGRAPHY-PRIVATE-LIMITED/U22130JH2019PTC013311,
https://www.zaubacorp.com/company/RLE-PRODUCTIONS-PRIVATE-LIMITED/U22130KL2019PTC059208,
https://www.zaubacorp.com/company/CATALIZADOR-MEDIA-PRIVATE-LIMITED/U22130KL2019PTC059793,
https://www.zaubacorp.com/company/TRIPPLED-MEDIAWORKS-OPC-PRIVATE-LIMITED/U22130MH2019OPC333171,
https://www.zaubacorp.com/company/KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391
Now I am trying to scrape tables from these links and save the data in well-organized CSV columns. I want to scrape the "Company Details", "Share Capital & Number of Employees", "Listing and Annual Compliance Details", "Contact Details", and "Director Details" tables. If any table has no data or any column is missing, I want that column left blank in the output CSV file. I have written some code but can't get the output; I am doing something wrong here. Please help.
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import csv
import lxml

url_file = "Zaubalinks.txt"
with open(url_file, "r") as url:
    url_pages = url.read()

# we need to split the urls into a list to make them iterable
pages = url_pages.split("\n")  # split by lines using \n

# now we run a for loop to visit the urls one by one
data = []
for single_page in pages:
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')
    table = soup.find_all('table')  # finds all tables
    table_top = pd.read_html(str(table))[0]  # the top table
    try:  # try to get the other tables if they exist
        table_capital = pd.read_html(str(table))[5]
        table_listing = pd.read_html(str(table))[6]
        table_contact = pd.read_html(str(table))[7]
        table_director = pd.read_html(str(table))[8]
    except:
        table_capital = pd.DataFrame()
        table_listing = pd.DataFrame()
        table_contact = pd.DataFrame()
        table_director = pd.DataFrame()
    result = pd.concat([table_top, table_capital, table_listing, table_contact, table_director])
    data.append(result)
    print(data)

pd.concat(data).to_csv('ZaubaAll.csv')
import requests
from bs4 import BeautifulSoup
import pandas as pd

companies = {
    'ASHRAFI-MEDIA-NETWORK-PRIVATE-LIMITED/U22120GJ2019PTC111757',
    'METTLE-PUBLICATIONS-PRIVATE-LIMITED/U22120MH2019PTC329729',
    'PRINTSCAPE-INDIA-PRIVATE-LIMITED/U22120MH2020PTC335354',
    'CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665',
    'BHOOKA-NANGA-FILMS-PRIVATE-LIMITED/U22130DL2019PTC353194',
    'WHITE-CAMERA-SCHOOL-OF-PHOTOGRAPHY-PRIVATE-LIMITED/U22130JH2019PTC013311',
    'RLE-PRODUCTIONS-PRIVATE-LIMITED/U22130KL2019PTC059208',
    'CATALIZADOR-MEDIA-PRIVATE-LIMITED/U22130KL2019PTC059793',
    'TRIPPLED-MEDIAWORKS-OPC-PRIVATE-LIMITED/U22130MH2019OPC333171',
    'KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391'
}

def main(url):
    with requests.Session() as req:
        goal = []
        for company in companies:
            r = req.get(url.format(company))
            df = pd.read_html(r.content)
            target = pd.concat([df[x].T for x in [0, 3, 4]], axis=1)
            goal.append(target)
        new = pd.concat(goal)
        new.to_csv("data.csv")

main("https://www.zaubacorp.com/company/{}")
Fortunately, it seems you can get there with simpler methods. Taking one random link as an example, it should be something like:
url = 'https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665'
import pandas as pd
tables = pd.read_html(url)
From here, your tables are in tables[0], tables[3], tables[4], tables[15], etc. Just use a for loop to iterate through all the urls, as sketched below.
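A minimal sketch of that loop, assuming the table indices seen on the example page hold for the other company pages as well (only two of the ten urls shown here for brevity):
import pandas as pd

# the zaubacorp links from the question (two shown here)
urls = [
    'https://www.zaubacorp.com/company/CHARVAKA-TELEVISION-NETWORK-PRIVATE-LIMITED/U22121KA2019PTC126665',
    'https://www.zaubacorp.com/company/KRYSTAL-CINEMAZ-PRIVATE-LIMITED/U22130MH2019PTC330391',
]

frames = []
for url in urls:
    tables = pd.read_html(url)
    # keep only the tables of interest; indices assumed from the example above
    wanted = pd.concat([tables[i] for i in [0, 3, 4]])
    frames.append(wanted)

pd.concat(frames).to_csv('zauba_tables.csv', index=False)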
There is a website which doesn't take query strings (they are hidden); there is an input field with an HTML id, and once you enter a value and click submit, you get a single-row table.
Is it possible to enter input values in a loop and get the table data by web scraping using Python along with BeautifulSoup or Flask? (Not Selenium.)
link
Click on Know your class & section
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

# Set the URL you want to webscrape from
url = 'https://www.pesuacademy.com/Academy'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

#results = soup.find(id = "knowClsSectionModalLoginId")
#R = soup.find(id = 'knowClsSectionModalTableDate')

try:
    a = soup.find('input', {'id': 'knowClsSectionModalLoginId'}).get('value')
    for i in a:
        inputv = i.get('value')
        print(i, '\n')
except:
    pass
I assume you are referring to "Know your Class & Section". This is a form.
The lookup is an AJAX POST call with the loginId.
You can put all the ids in the list loginids. The script loops through them, gets all the data, and saves it to a csv file.
import requests
from bs4 import BeautifulSoup
import pandas as pd

loginids = ["PES1201900004"]

payload = {
    "loginId": ""
}
headers = {
    "content-type": "application/x-www-form-urlencoded"
}
url = "https://pesuacademy.com/Academy/getStudentClassInfo"
columns = ['PRN', 'SRN', 'Name', 'Class', 'Section', 'Cycle', 'Department', 'Branch', 'Institute Name']

data = []
for logins in loginids:
    payload["loginId"] = logins
    res = requests.post(url, data=payload, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    data.append([i.get_text(strip=True) for i in soup.find("table").find("tbody").find_all("td")])

df = pd.DataFrame(data, columns=columns)
df.to_csv("data.csv", index=False)
print(df)
Output:
PRN SRN Name Class Section Cycle Department Branch Institute Name
0 PES1201900004 NA AKSHAYA RAMESH NA B ARCH
I am scraping a table from https://csr.gov.in/companyprofile.php?year=FY+2015-16&CIN=L00000CH1990PLC010573 but I am not getting the exact result I am looking for. I want 11 columns from this link: "Company Name", "Class", "State", "Company Type", "RoC", "Sub Category", "Listing Status". Those are 7 columns, and after that you can see an expand button "CSR Details of FY 2017-18"; when you click on that button you get 4 more columns: "Average Net Profit", "CSR Prescribed Expenditure", "CSR Spent", "Local Area Spent". I want all these columns in a csv file. I wrote some code but it is not working properly. I am attaching an image of the result for reference. Here is my code; please help me get this data.
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import csv

driver = webdriver.Chrome()

url_file = "csrdata.txt"
with open(url_file, "r") as url:
    url_pages = url.read()

# we need to split the urls into a list to make them iterable
pages = url_pages.split("\n")  # split by lines using \n

data = []
# now we run a for loop to visit the urls one by one
for single_page in pages:
    driver.get(single_page)
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')
    driver.find_element_by_link_text("CSR Details of FY 2017-18").click()
    table = driver.find_elements_by_xpath("//*[contains(@id,'colfy4')]")
    about = table.__getitem__(0).text
    x = about.split('\n')
    print(x)
    data.append(x)

df = pd.DataFrame(data)
print(df)

# write to csv
df.to_csv('csr.csv')
You don't need to use Selenium, since all the information is inside the HTML code. You can also use pandas' built-in function pd.read_html() to transform an HTML table directly into a DataFrame.
import requests
import pandas as pd
from bs4 import BeautifulSoup

data = []
for single_page in pages:  # `pages` is the url list read from your file, as before
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')
    table = soup.find_all('table')  # finds all tables
    table_top = pd.read_html(str(table))[0]  # the top table
    try:  # try to get the other table if it exists
        table_extra = pd.read_html(str(table))[7]
    except:
        table_extra = pd.DataFrame()
    result = pd.concat([table_top, table_extra])
    data.append(result)

pd.concat(data).to_csv('test.csv')
output:
0 1
0 Class Public
1 State Chandigarh
2 Company Type Other than Govt.
3 RoC RoC-Chandigarh
4 Sub Category Company limited by shares
5 Listing Status Listed
0 Average Net Profit 0
1 CSR Prescribed Expenditure 0
2 CSR Spent 0
3 Local Area Spent 0
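If you would rather have one row per company with the labels as column headers, instead of the stacked label/value rows above, a possible tweak (a minimal sketch, assuming the result frame from the loop above keeps labels in column 0 and values in column 1):
# inside the loop, instead of data.append(result):
wide = result.set_index(0).T.reset_index(drop=True)  # labels become column headers
data.append(wide)

# after the loop: one row per page
pd.concat(data, ignore_index=True).to_csv('test_wide.csv', index=False)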
I wanted to try to scrape some specific columns (the Company Details column) from the CNBC Nasdaq 100 website, specifically the Adobe stock. Below is a snippet of my code:
# Importing Libraries
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd

def get_company_info(url):
    original_url = url
    key = {}
    l = []
    page_response = requests.get(url, timeout=240)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    name = page_content.find('div', {"class": "quote-section-header large-header"}).find("span", {"class": "symbol"}).text
    description = page_content.find_all('div', {"class": "moduleBox"})
    for items in description:
        for i in range(len(items.find_all("tr")) - 1):
            # Gather data
            key["stock_desc"] = items.find_all("td", {"class": "desc"})[i].find('div', attrs={'id': 'descLong'}).text
            shares = items.find_all("td").find("table", attrs={"id": "shares"})
            for rest_of_items in shares:
                for i in range(len(items.find_all("tr")) - 1):
                    key["stock_outstanding-shares"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_ownership"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_market_cap"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    key["stock_lastSplit"] = items.find_all("td", {"class": "bold aRit"})[i].text
                    # Print ("")
                    l.append(key)
                    key['name'] = name
    df = pd.DataFrame(l)
    print(df)
    return key, df

get_company_info("https://www.cnbc.com/quotes/?symbol=ADBE&tab=profile")
So I'm keen to get the result as a DataFrame so that I can convert it to a CSV file, but my code keeps producing an empty DataFrame. Below is the error shown.
The result I wanted is something like this:
The information you are looking for is not available at the url you requested. This is because the page fetches the information using JavaScript, which in turn requests a different URL that provides the data.
Example code
from bs4 import BeautifulSoup
import requests

page = requests.get("https://apps.cnbc.com/view.asp?symbol=ADBE.O&uid=stocks/summary")
soup = BeautifulSoup(page.content, 'html.parser')

Name = soup.find("h5", id="companyName").text
stock_desc = soup.find("div", id="descLong").text

table = soup.find("table", id="shares")
details = table.find_all("td", class_="bold aRit")

stock_outstanding_shares = details[0].text
stock_ownership = details[1].text
stock_market_cap = details[2].text
stock_lastSplit = details[3].text
You can then create a DataFrame from these values and export it to csv, as sketched below.
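A minimal sketch of that last step, assuming the variables scraped in the example code above (the output filename is hypothetical):
import pandas as pd

# Build a one-row DataFrame from the scraped values (names from the example above)
df = pd.DataFrame([{
    "Name": Name,
    "Description": stock_desc,
    "Outstanding Shares": stock_outstanding_shares,
    "Ownership": stock_ownership,
    "Market Cap": stock_market_cap,
    "Last Split": stock_lastSplit,
}])
df.to_csv("adbe_profile.csv", index=False)  # hypothetical output filename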