I would like to obtain a table that appears on a URL: https://www.coronavirus.vic.gov.au/exposure-sites
When right-clicking and inspecting the element, it is evident there is a table element with a class that can be referenced. However, that table does not appear in the response when the page is requested.
Reproducible example:
import pandas as pd
import requests
from bs4 import BeautifulSoup
header = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
"X-Requested-With": "XMLHttpRequest"
}
link = 'https://www.coronavirus.vic.gov.au/exposure-sites'
r = requests.get(link, headers=header)
soup = BeautifulSoup(r.text, "html5lib")
htmltable = soup.find('table', { 'class' : "rpl-row rpl-search-results-layout__main rpl-row--gutter" })
# An error appears here because the table above isn't found, even though it should exist?
print(htmltable)
def tableDataText(table):
"""Parses a html segment started with tag <table> followed
by multiple <tr> (table rows) and inner <td> (table data) tags.
It returns a list of rows with inner columns.
Accepts only one <th> (table header/data) in the first row.
"""
def rowgetDataText(tr, coltag='td'): # td (data) or th (header)
return [td.get_text(strip=True) for td in tr.find_all(coltag)]
rows = []
trs = table.find_all('tr')
headerow = rowgetDataText(trs[0], 'th')
if headerow: # if there is a header row include first
rows.append(headerow)
trs = trs[1:]
for tr in trs: # for every table row
rows.append(rowgetDataText(tr, 'td') ) # data row
return rows
list_table = tableDataText(htmltable)
df = pd.DataFrame(list_table[1:], columns=list_table[0])
df
The end state should be a single table that combines the 18 pages of tables on the webpage.
You can call this URL to get the data in JSON format. It returns a list of dictionaries, and you can loop over it and extract each value by its key:
import requests
# The exposure-sites page loads its table from this CKAN endpoint
res = requests.get("https://www.coronavirus.vic.gov.au/sdp-ckan?resource_id=afb52611-6061-4a2b-9110-74c920bede77&limit=10000")
data = res.json()
main_data = data['result']['records']
for record in main_data:
    print(record['Suburb'])
    print(record['Site_title'])
Output:
Newport
TyrePlus Newport
Newport
TyrePlus Newport
Newport
...
To find the URL, open Chrome Developer Tools, go to the Network tab, refresh the page, and look through the requests on the left-hand side; the Preview pane shows which request returns the JSON data.
For a DataFrame:
import pandas as pd
df = pd.DataFrame(main_data)
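If you only need specific fields, you can select them from that DataFrame using the same keys as above (Suburb and Site_title are the only keys shown here; the resource has other columns, whose names would need to be checked in the JSON):
sites = df[['Suburb', 'Site_title']]  # keep only the columns of interest
print(sites.head())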
I am trying to run this code in IDLE 3.10.6 and I am not seeing any of the data that should be extracted from Indeed. The data should appear in the output when I run it, but it doesn't. Below is the code:
#Indeed data
import requests
from bs4 import BeautifulSoup
import pandas as pd
def extract(page):
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko"}
url = "https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}"
r = requests.get(url,headers)
soup = BeautifulSoup(r.content, "html.parser")
return soup
def transform(soup):
divs = soup.find_all("div", class_ = "jobsearch-SerpJobCard")
for item in divs:
title = item.find ("a").text.strip()
company = item.find("span", class_="company").text.strip()
try:
salary = item.find("span", class_ = "salarytext").text.strip()
finally:
salary = ""
summary = item.find("div",{"class":"summary"}).text.strip().replace("\n","")
job = {
"title":title,
"company":company,
'salary':salary,
"summary":summary
}
joblist.append(job)
joblist = []
for i in range(0,40,10):
print(f'Getting page, {i}')
c = extract(10)
transform(c)
df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')
Here is the output I get
Getting page, 0
Getting page, 10
Getting page, 20
Getting page, 30
Empty DataFrame
Columns: []
Index: []
Why is this happening, and what should I do to get the extracted data from Indeed? What I am trying to get is the job title, company, salary, and summary information. Any help would be greatly appreciated.
The URL string includes {page}, but it's not an f-string, so it's not being interpolated, and the URL you are fetching is:
https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}
That returns an error page.
So you should add an f before the opening quote when you set url.
Also, you are calling extract(10) each time, instead of extract(i).
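Putting both fixes together, the call site would look something like this (a minimal sketch, with the rest of your code unchanged; note that, as the next answer points out, Indeed may still respond with 403 to scripted requests):
def extract(page):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"}
    # f-string so {page} is actually interpolated into the URL
    url = f"https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}"
    r = requests.get(url, headers=headers)  # pass headers as a keyword argument
    return BeautifulSoup(r.content, "html.parser")

for i in range(0, 40, 10):
    print(f'Getting page, {i}')
    c = extract(i)   # use the loop variable, not a hard-coded 10
    transform(c)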
This is the correct way of building the url:
url = "https://www.indeed.com/jobs?q=Data&l=United+States&sc=0kf%3Ajt%28internship%29%3B&vjk=a2f49853f01db3cc={page}".format(page=page)
r = requests.get(url, headers=headers)
Here r.status_code returns 403, which means the request is forbidden: the site is blocking your request. Use the Indeed job search API instead.
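Before parsing anything, it is worth failing loudly on a blocked request instead of silently building an empty DataFrame, for example:
if r.status_code == 403:
    print("Request blocked: 403 Forbidden")
r.raise_for_status()  # raises requests.exceptions.HTTPError for any 4xx/5xx response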
I am trying to web-scrape every Track, Roblox ID, and Rating from https://robloxsong.com/ and want them in a pandas DataFrame. But when I try the code below, it gives me a single list with all the Track, ID, and Rating values joined with "\n". Also, I was unable to jump through all 50 pages and get all the data.
#Importing
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
webD = webdriver.Chrome(ChromeDriverManager().install())
webD.get('https://robloxsong.com/')
#Loading data from the songs tag
elements = webD.find_elements_by_class_name('songs')
#Declaring DataFrame
result = pd.DataFrame(columns = ['Track','Roblox_ID','Rating'])
#Extracting Text
listOfElements = []
for i in elements:
listOfElements.append(i.text)
print(listOfElements)
When I print listOfElements below is the output
>>> ["Track Roblox ID Rating\negg\n5128532009\nCopy\n30267\nCaillou Trap Remix\n212675193\nCopy\n26550\nZEROTWOOOOO\n3951847031\nCopy\n26045\nRUNNING IN THE OOFS! (EPIC)\n1051512943\nCopy\n25938\nSPOOKY SCARY SKELETONS (100,000+ sales)\n160442087\nCopy\n24106\nBanana Song (I'm A Banana)\n169360242\nCopy\n23065\nshrek anthem\n152828706\nCopy\n22810\nraining tacos\n142376088\nCopy\n19135\nGFMO - Hello (100k!!)\n214902446\nCopy\n19118\nWide Put in Walking Audio\n5356051569\nCopy\n13472\nRaining Tacos. (Original)\n142295308\nCopy\n13235\nNARWHALS\n130872377\nCopy\n12858\nOld Town Road\n2862170886\nCopy\n11888\nno\n130786686\nCopy\n11570\nCRAB RAVE OOF\n2590490779\nCopy\n11551\nKFC is illuminati confirmed ( ͡° ͜ʖ ͡° )\n205254380\nCopy\n10668\nNightcore - Titanium\n398159550\nCopy\n10667\nHelp Me Help You Logan Paul\n833322858\nCopy\n10631\nI Like Trains\n131072261\nCopy\n10271\nI'm Fine.\n513919776\nCopy\n9289\nAINT NOBODY GOT TIME FOR DAT\n130776739\nCopy\n9093\nRoxanne\n4277136473\nCopy\n8912\nFlamingo Intro\n6123746751\nCopy\n8836\nOld Town Road OOFED\n3180460921\nCopy\n8447\nWii Music\n1305251774\nCopy\n8364\nHow To Save A Life (Bass Boosted)\n727844285\nCopy\n8309\nDubstep Remix [26k+]\n130762736\nCopy\n8052\nEVERYBODY DO THE FLOP\n130778839\nCopy\n7962\nAnt, SeeDeng, Poke - PRESTONPLAYZ ROBLOX\n1096142805\nCopy\n7778\nYeah Yeah Yeahs - Heads Will Roll (JVH-C remix)\n290176752\nCopy\n7706\n♪ Nightcore - Light 'Em Up x Girl On Fire (S/V)\n587156015\nCopy\n7527\nDo the Harlem Shake!\n131154740\nCopy\n7314\nZero two but full song\n5060369688\nCopy\n7221\nInvinsible [NCS]\n6104227669\nCopy\n7011\nParty Music\n141820924\nCopy\n7009\n♫♫Ƴℴu'ѵҿ ßƏƏƝ ƮƦ☉ᏝᏝƎƊ♫♫\n142633540\nCopy\n6972\nRevenge (Minecraft Music)\n3807239428\nCopy\n6943\nOOF LASAGNA\n2866646141\nCopy\n6808\nAlbert Sings Despacito\n1398660411\nCopy\n6655\nDo A Barrel Roll!\n130791919\nCopy\n6647\nLadies And Gentlemen We Got Him\n2624663028\nCopy\n6642\nCreepy Music Box\n143382469\nCopy\n6516\nThe Roblox Song\n1784385682\nCopy\n6474\nZEROTWOOOOO with panda\n4459223174\nCopy\n6362\nsad violin\n135308045\nCopy\n6261\noofing in the 90's\n915288747\nCopy\n6092\nElevator Music\n130768299\nCopy\n5998\nFEED ME!\n130766856\nCopy\n5909\nTanqR Outro\n5812114304\nCopy\n5859\nMako - Beam (Proximity)\n165065112\nCopy\n5787"]
I need answers to two problems:
How do I get this into a DataFrame?
How do I get the data from all 50 pages?
You can simply use the requests library for fetching each page and pandas for parsing the tables from it. To get all the pages, you need to fetch and parse each one separately. The following code parses all the pages into a single DataFrame:
import requests
import pandas as pd
def parse_tables(page_html):
page_tables = pd.read_html(page_html) # directly parse tables into pandas dataframe
column_names = page_tables[0].columns # save column names for later use
page_tables[0].columns = range(len(column_names))
# data is present in multiple <table> html tags but it looks like a single table, so combine data from all <table> tags
df = pd.concat(page_tables)
df.columns = column_names
return df.reset_index(drop=True)
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
base_url = "https://robloxsong.com/"
num_pages = 50  # number of pages that you want to parse
ratings_tables = []
for page_num in range(1, num_pages + 1):
    page_url = base_url + "?page=" + str(page_num)
    print("Parsing page " + str(page_num))
    response = requests.get(page_url, headers=headers)  # fetch the html page
    if response.ok:
        page_html = response.content
        page_table = parse_tables(page_html)
        ratings_tables.append(page_table)
    else:
        print("Unable to fetch page:", response.content)
final_ratings_table = pd.concat(ratings_tables).reset_index(drop=True)
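The combined table can then be inspected or saved like any other DataFrame, for example:
print(final_ratings_table.shape)   # roughly 50 pages worth of rows
print(final_ratings_table.head())
final_ratings_table.to_csv("roblox_songs.csv", index=False)  # output file name is just an example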
I'm having some problems extracting the table from this page: www.nasdaq.com/market-activity/stocks/aapl/dividend-history
My code is:
import requests
from bs4 import BeautifulSoup
headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
page = requests.get("https://www.nasdaq.com/market-activity/stocks/aapl/dividend-history", headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table', attrs={'class':'dividend-history__table'})
table_body = table.find('tbody', attrs={'class': 'dividend-history__table-body'})
data = []
rows = table_body.find('tr', attrs={'class': 'dividend-history__row dividend-history__row--data'})
for row in rows:
cols = row.find('td')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele]) # Get rid of empty values
They have the table in an iframe. I'm not sure what I'm doing wrong - it's the first time I've used BeautifulSoup - I normally use requests + XPath for this sort of thing.
What happens?
The table is not in an iframe, contrary to your guess.
The table data is generated dynamically, so you cannot grab it with your approach.
Everything up to the following lines would work fine if the content were static.
Take a look: find() only gets the first occurrence matching your filter - a single <tr> - whereas you should use find_all():
rows = table_body.find('tr', attrs={'class': 'dividend-history__row dividend-history__row--data'})
You then try to loop over that single element as if it were a list:
for row in rows:
Same thing with find() again:
cols = row.find('td')
And again you iterate over a single element as if it were a list:
cols = [ele.text.strip() for ele in cols]
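Rewritten with find_all(), the question's loop would look roughly like this (a sketch only; on this page it would still find nothing, because the table is built by JavaScript):
rows = table_body.find_all('tr', attrs={'class': 'dividend-history__row dividend-history__row--data'})
for row in rows:
    cols = row.find_all('td')                  # all cells in the row, not just the first
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])  # get rid of empty values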
What about an alternative?
There are different approaches to scraping data from websites, and you often hit the limits of requests when pages serve dynamically generated data.
How about using the website's API?
With requests and pandas it can be this simple:
import requests
import pandas as pd
url = "https://api.nasdaq.com/api/quote/AAPL/dividends?assetclass=stocks"
headers = {"user-agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers)
df = pd.json_normalize(r.json()['data']['dividends']['rows'])
df
Output
exOrEffDate type amount declarationDate recordDate paymentDate
11/06/2020 CASH $0.205 10/29/2020 11/09/2020 11/12/2020
08/07/2020 CASH $0.82 07/30/2020 08/10/2020 08/13/2020
05/08/2020 CASH $0.82 04/30/2020 05/11/2020 05/14/2020
02/07/2020 CASH $0.77 01/28/2020 02/10/2020 02/13/2020
11/07/2019 CASH $0.77 10/30/2019 11/11/2019 11/14/2019
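The API returns everything as strings, so if you want to work with the values numerically you can convert them afterwards (a small sketch; the column names are taken from the output above):
df['amount'] = df['amount'].str.replace('$', '', regex=False).astype(float)
df['exOrEffDate'] = pd.to_datetime(df['exOrEffDate'], format='%m/%d/%Y', errors='coerce')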
"Hello, i am quite new to web-scraping. I recently retrieved a list of web-links and there are URLs within these links containing data from tables. I am planning to scrape the data but can't seem to even get the URLs. Any form of help is much appreciated"
"The list of weblinks are
https://aviation-safety.net/database/dblist.php?Year=1919
https://aviation-safety.net/database/dblist.php?Year=1920
https://aviation-safety.net/database/dblist.php?Year=1921
https://aviation-safety.net/database/dblist.php?Year=1922
https://aviation-safety.net/database/dblist.php?Year=2019
"From the list of links, i am planning to
a. get the URLs within these links
https://aviation-safety.net/database/record.php?id=19190802-0
https://aviation-safety.net/database/record.php?id=19190811-0
https://aviation-safety.net/database/record.php?id=19200223-0
"b. get data from tables within each URL
(e.g., Incident date, incident time, type, operator, registration, msn, first flight, classification)"
#Get the list of weblinks
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
headers = {'insert user agent'}
#start of code
mainurl = "https://aviation-safety.net/database/"
def getAndParseURL(mainurl):
result = requests.get(mainurl)
soup = BeautifulSoup(result.content, 'html.parser')
datatable = soup.find_all('a', href = True)
return datatable
datatable = getAndParseURL(mainurl)
#go through the content and grab the URLs
links = []
for link in datatable:
if 'Year' in link['href']:
url = link['href']
links.append(mainurl + url)
#check if links are in dataframe
df = pd.DataFrame(links, columns=['url'])
df.head(10)
#save the links to a csv
df.to_csv('aviationsafetyyearlinks.csv')
#from the csv read each web-link and get URLs within each link
import csv
from urllib.request import urlopen
contents = []
df = pd.read_csv('aviationsafetyyearlinks.csv')
urls = df['url']
for url in urls:
contents.append(url)
for url in contents:
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
addtable = soup.find_all('a', href = True)
"I am only able to get the list of web-links and am unable to get the URLs nor the data within these web-links. The code continually shows arrays
not really sure where my code is wrong, appreciate any help and many thanks in advance."
Add a User-Agent header when requesting the page.
headers = {'User-Agent':
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
mainurl = "https://aviation-safety.net/database/dblist.php?Year=1919"
def getAndParseURL(mainurl):
result = requests.get(mainurl,headers=headers)
soup = BeautifulSoup(result.content, 'html.parser')
datatable = soup.select('a[href*="database/record"]')
return datatable
print(getAndParseURL(mainurl))
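To go further with step (b) of the question, one possible sketch (assuming, as the question says, each record page holds its details in HTML tables; the exact layout may differ per record) is to build absolute record URLs from those hrefs and let pandas read whatever tables it finds:
import pandas as pd

record_links = getAndParseURL(mainurl)
record_urls = ["https://aviation-safety.net/" + a['href'].lstrip('/') for a in record_links]
record_tables = []
for record_url in record_urls[:5]:  # limit to a few records while testing
    page = requests.get(record_url, headers=headers)
    # read_html returns one DataFrame per <table> found on the record page
    record_tables.append(pd.read_html(page.text))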
I am a beginner to Python programming. I am practicing web scraping using the bs4 module in Python.
I have extracted some fields from a web page, but it extracts only 13 items even though the web page has more than 13. I cannot understand why the rest of the items are not extracted.
I also want to extract the contact number and email address of each item, but they are only available on each item's individual page. Frankly speaking, I am stuck on how to access and scrape each item's individual page from the listing page. Kindly tell me where I am going wrong and, if possible, suggest what should be done.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
res = requests.post('https://www.nelsonalexander.com.au/real-estate-agents/?office=&agent=A')
soup = bs(res.content, 'lxml')
data = soup.find_all("div",{"class":"agent-card large large-3 medium-4 small-12 columns text-center end"})
records = []
for item in data:
name = item.find('h2').text.strip()
position = item.find('h3').text.strip()
records.append({'Names': name, 'Position': position})
df = pd.DataFrame(records,columns=['Names','Position'])
df=df.drop_duplicates()
df.to_excel(r'C:\Users\laptop\Desktop\NelsonAlexander.xls', sheet_name='MyData2', index = False, header=True)
The above code just extracts the name and position of each item, but it only scrapes 13 records even though there are more on the web page. I could not write any code for extracting the contact number and email address of each record because they are inside each item's individual page, and that is where I got stuck.
That website loads the list dynamically as you scroll; however, you can trace the AJAX request and parse the data directly:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', 'Referer': 'https://www.nelsonalexander.com.au/real-estate-agents/?office=&agent=A'}
records = []
with requests.Session() as s:
for i in range(1, 22):
res = s.get(f'https://www.nelsonalexander.com.au/real-estate-agents/page/{i}/?ajax=1&agent=A', headers=headers)
soup = bs(res.content, 'lxml')
data = soup.find_all("div",{"class":"agent-card large large-3 medium-4 small-12 columns text-center end"})
for item in data:
name = item.find('h2').text.strip()
position = item.find('h3').text.strip()
phone = item.find("div",{"class":"small-6 columns text-left"}).find("a").get('href').replace("tel:", "")
records.append({'Names': name, 'Position': position, 'Phone': phone})
df = pd.DataFrame(records,columns=['Names','Position', 'Phone'])
df=df.drop_duplicates()
df.to_excel(r'C:\Users\laptop\Desktop\NelsonAlexander.xls', sheet_name='MyData2', index = False, header=True)
I am convinced that the emails are nowhere in the DOM. I made some modifications to @drec4s' code so it instead keeps going until there are no more entries (dynamically).
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import itertools
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', 'Referer': 'https://www.nelsonalexander.com.au/real-estate-agents/?office=&agent=A'}
records = []
with requests.Session() as s:
for i in itertools.count(1):  # page numbering starts at 1, as in the answer above
res = s.get('https://www.nelsonalexander.com.au/real-estate-agents/page/{}/?ajax=1&agent=A'.format(i), headers=headers)
soup = bs(res.content, 'lxml')
data = soup.find_all("div",{"class":"agent-card large large-3 medium-4 small-12 columns text-center end"})
if(len(data) > 0):
for item in data:
name = item.find('h2').text.strip()
position = item.find('h3').text.strip()
phone = item.find("div",{"class":"small-6 columns text-left"}).find("a").get('href').replace("tel:", "")
records.append({'Names': name, 'Position': position, 'Phone': phone})
print({'Names': name, 'Position': position, 'Phone': phone})
else:
break
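As in the answer above, the collected records can then be turned into a DataFrame and written out:
df = pd.DataFrame(records, columns=['Names', 'Position', 'Phone'])
df = df.drop_duplicates()
df.to_excel(r'C:\Users\laptop\Desktop\NelsonAlexander.xls', sheet_name='MyData2', index=False, header=True)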