I have been looking to scrape an HTML table from an oil production SSRS feed online. I have managed to learn a bit of Beautiful Soup/Python to get to the point I am at currently, but I think I need a little assistance to get it finished.
The aim is to scrape the table, which is all tagged, and output JSON data. I have a JSON-formatted output for the 10 headers, but it repeats the same data-row cell value for every header. I think the iteration through the cells to assign them to the headers is the issue. I'm sure it will make sense when run.
Any assistance would be greatly appreciated; I'm trying to learn just what I have done wrong, as this is pretty new to me.
Cheers
import json
from bs4 import BeautifulSoup
import urllib.request
import boto3
import botocore

# Url to scrape
url = ('http://factpages.npd.no/ReportServer?/FactPages/TableView/'
       'field_production_monthly&rs:Command=Render&rc:Toolbar=false'
       '&rc:Parameters=f&Top100=True&IpAddress=108.171.128.174&CultureCode=en')

# Agent detail to prevent scraping bot detection
user_agent = ('Mozilla/5(Macintosh; Intel Mac OS X 10_9_3) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36')
header = {'User-Agent': user_agent}

# Request url from list above, assign headers from criteria above
req = urllib.request.Request(url, headers=header)
# Open url from the previous request and assign
npddata = urllib.request.urlopen(req, timeout=20)
# Start soup on url request data
soup = BeautifulSoup(npddata, 'html.parser')

# Scrape the html table variable from selected website
table = soup.find('table')

headers = {}
col_headers = soup.findAll('tr')[3].findAll('td')
for i in range(len(col_headers)):
    headers[i] = col_headers[i].text.strip()
# print(json.dumps(headers, indent=4))

cells = {}
rows = soup.findAll('td', {
    'class': ['a61cl', 'a65cr', 'a69cr', 'a73cr', 'a77cr', 'a81cr', 'a85cr',
              'a89cr', 'a93cr', 'a97cr']})
for row in rows[i]:  # remove index! (###ISSUE COULD BE HERE####)
    # findall function was original try (replace getText with FindAll to try)
    cells = row.getText('div')
    # Attempt to fix, can remove and go back to above
    # for i in range(len(rows)): cells[i] = rows[i].text.strip()
    # print(cells)  # print(json.dumps(cells, indent=4))

data = []
item = {}
for index in headers:
    item[headers[index]] = cells  # [index]
    # if no getText on line 47 then .text() here ### ISSUE COULD BE HERE####
    data.append(item)
# print(data)
print(json.dumps(data, indent=4))
# print(item)
print(json.dumps(item, indent=4))
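For reference, the repetition appears to come from cells ending up as a single value that then gets assigned to every header. A minimal sketch of the intended pairing, reusing the headers and rows variables from the snippet above (not verified against the live feed), would chunk the cells per row and zip them against the header names:

# Hedged sketch: chunk the flat list of data cells into rows of
# len(header_names) cells each, then zip every chunk against the headers.
header_names = [headers[i] for i in sorted(headers)]
cell_texts = [td.get_text(strip=True) for td in rows]

data = []
for start in range(0, len(cell_texts), len(header_names)):
    row_cells = cell_texts[start:start + len(header_names)]
    data.append(dict(zip(header_names, row_cells)))

print(json.dumps(data, indent=4))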
There were some errors in your code; I fixed those errors and modified your code a little.
Is this what you want?
import requests
from bs4 import BeautifulSoup
import json

# Webpage connection
html = "http://factpages.npd.no/ReportServer?/FactPages/TableView/field_production_monthly&rs:Command=Render&rc:Toolbar=false&rc:Parameters=f&Top100=True&IpAddress=108.171.128.174&CultureCode=en"
r = requests.get(html)
c = r.content
soup = BeautifulSoup(c, "html.parser")

rows = soup.findAll('td', {
    'class': ['a61cl', 'a65cr', 'a69cr', 'a73cr', 'a77cr', 'a81cr', 'a85cr',
              'a89cr', 'a93cr', 'a97cr']})
headers = soup.findAll('td', {
    'class': ['a20c', 'a24c', 'a28c', 'a32c', 'a36c', 'a40c', 'a44c', 'a48c',
              'a52c']})

headers_list = [item.getText('div') for item in headers]
rows_list = [item.getText('div') for item in rows]

final = [rows_list[item:item + 9] for item in range(0, len(rows_list), 9)]

row_header = {}
for item in final:
    for indices in range(0, 9):
        if headers_list[indices] not in row_header:
            row_header[headers_list[indices]] = [item[indices]]
        else:
            row_header[headers_list[indices]].append(item[indices])

result = json.dumps(row_header, indent=4)
print(result)
A sample of the output:
{
    "Year": [
        "2009",
        "2009",
        "2009",
        "2009",
        "2009",
        "2009",
        "2010",
        "2010",
        "2010",
        "2010",
        "2010",
I am trying to scrape the table from 2010-2020 at:
https://id.investing.com/commodities/gold-historical-data
The problem is that the link stays the same whether I use the default dates or the dates I choose, so how can I tell Python to scrape the data from 2010 to 2020? Please help me; I'm using Python 3.
This is my code:
import requests, bs4

url = 'https://id.investing.com/commodities/gold-historical-data'
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = bs4.BeautifulSoup(response.text, 'lxml')
tables = soup.find_all('table')
print(soup)

with open('emasfile.csv', 'w') as csv:
    for row in tables[1].find_all('tr'):
        line = ""
        for td in row.find_all(['td', 'th']):
            line += '"' + td.text + '",'
        csv.write(line + '\n')
This page uses JavaScript with AJAX to get its data from
https://id.investing.com/instruments/HistoricalDataAjax
It sends POST requests with extra data, including the start and end dates ("st_date", "end_date").
You could try 01/01/2010 and 12/31/2020 in one request, but I used a for-loop to get every year separately.
I got all of this information from the DevTools 'Network' tab in Chrome/Firefox.
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://id.investing.com/instruments/HistoricalDataAjax'

payload = {
    "curr_id": "8830",
    "smlID": "300004",
    "header": "Data+Historis+Emas+Berjangka",
    "st_date": "01/30/2020",
    "end_date": "12/31/2020",
    "interval_sec": "Daily",
    "sort_col": "date",
    "sort_ord": "DESC",
    "action": "historical_data"
}

headers = {
    #"Referer": "https://id.investing.com/commodities/gold-historical-data",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0",
    "X-Requested-With": "XMLHttpRequest"
}

fh = open('output.csv', 'w')
csv_writer = csv.writer(fh)

for year in range(2010, 2021):
    print('year:', year)

    payload["st_date"] = f"01/01/{year}"
    payload["end_date"] = f"12/31/{year}"

    r = requests.post(url, data=payload, headers=headers)
    #print(r.text)

    soup = BeautifulSoup(r.text, 'lxml')
    table = soup.find('table')

    for row in table.find_all('tr')[1:]:  # [1:] to skip header
        row_data = [item.text for item in row.find_all('td')]
        print(row_data)
        csv_writer.writerow(row_data)

fh.close()
"Hello, i am quite new to web-scraping. I recently retrieved a list of web-links and there are URLs within these links containing data from tables. I am planning to scrape the data but can't seem to even get the URLs. Any form of help is much appreciated"
"The list of weblinks are
https://aviation-safety.net/database/dblist.php?Year=1919
https://aviation-safety.net/database/dblist.php?Year=1920
https://aviation-safety.net/database/dblist.php?Year=1921
https://aviation-safety.net/database/dblist.php?Year=1922
https://aviation-safety.net/database/dblist.php?Year=2019"
"From the list of links, i am planning to
a. get the URLs within these links
https://aviation-safety.net/database/record.php?id=19190802-0
https://aviation-safety.net/database/record.php?id=19190811-0
https://aviation-safety.net/database/record.php?id=19200223-0"
"b. get data from tables within each URL
(e.g., Incident date, incident time, type, operator, registration, msn, first flight, classification)"
# Get the list of weblinks
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

headers = {'insert user agent'}

# start of code
mainurl = "https://aviation-safety.net/database/"

def getAndParseURL(mainurl):
    result = requests.get(mainurl)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.find_all('a', href=True)
    return datatable

datatable = getAndParseURL(mainurl)

# go through the content and grab the URLs
links = []
for link in datatable:
    if 'Year' in link['href']:
        url = link['href']
        links.append(mainurl + url)

# check if links are in dataframe
df = pd.DataFrame(links, columns=['url'])
df.head(10)

# save the links to a csv
df.to_csv('aviationsafetyyearlinks.csv')

# from the csv, read each web-link and get URLs within each link
import csv
from urllib.request import urlopen

contents = []
df = pd.read_csv('aviationsafetyyearlinks.csv')
urls = df['url']
for url in urls:
    contents.append(url)

for url in contents:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    addtable = soup.find_all('a', href=True)
"I am only able to get the list of web-links and am unable to get the URLs nor the data within these web-links. The code continually shows arrays
not really sure where my code is wrong, appreciate any help and many thanks in advance."
When requesting the page, add a User-Agent header.

headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
mainurl = "https://aviation-safety.net/database/dblist.php?Year=1919"

def getAndParseURL(mainurl):
    result = requests.get(mainurl, headers=headers)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.select('a[href*="database/record"]')
    return datatable

print(getAndParseURL(mainurl))
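The question also asks (part b) for the data inside each record page. A minimal sketch of that second step, assuming each record page keeps its details in the first HTML table as label/value rows (the selectors here are not verified against the live site):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

def parse_record(url):
    # Fetch one record page and turn its label/value rows into a dict.
    # Assumes the first <table> holds rows such as "Date:" / "<date>".
    result = requests.get(url, headers=headers)
    soup = BeautifulSoup(result.content, 'html.parser')
    table = soup.find('table')
    record = {}
    if table is None:
        return record
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) >= 2:
            label = cells[0].get_text(strip=True).rstrip(':')
            record[label] = cells[1].get_text(strip=True)
    return record

print(parse_record('https://aviation-safety.net/database/record.php?id=19190802-0'))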
I'm trying to scrape a website with job postings data, and the output looks like this:
[{'job_title': 'Junior Data Scientist', 'company': '\n\n BBC',
'summary': "\n We're now seeking a Junior Data Scientist to come and work with our Marketing & Audiences team in London. The Data Science team are responsible for designing...",
'link': 'www.jobsite.com',
'summary_text': "Job Introduction\nImagine if Netflix, The Huffington Post, ESPN, and Spotify were all rolled into one....etc
I want to create a dataframe, or a CSV, that looks like this:
Right now, this is the loop I'm using:
for page in pages:
    source = requests.get('https://www.jobsite.co.uk/jobs?q=data+scientist&start='.format()).text
    soup = BeautifulSoup(source, 'lxml')

    results = []
    for jobs in soup.findAll(class_='result'):
        result = {
            'job_title': '',
            'company': '',
            'summary': '',
            'link': '',
            'summary_text': ''
        }
After the loop, I just print the results.
What would be a good way to get the output into a dataframe? Thanks!
Look at the pandas DataFrame API. There are several ways you can initialize a dataframe:
a list of dictionaries
a list of lists
You just need to append either a list or a dictionary to a global variable, and you should be good to go. (A sketch of the list-of-lists variant is shown after the example below.)
results = []
for page in pages:
    source = requests.get('https://www.jobsite.co.uk/jobs?q=data+scientist&start='.format()).text
    soup = BeautifulSoup(source, 'lxml')

    for jobs in soup.findAll(class_='result'):
        result = {
            'job_title': '',  # assuming this has a value like in the example in your question
            'company': '',
            'summary': '',
            'link': '',
            'summary_text': ''
        }
        results.append(result)

# results is now a list of dictionaries
df = pandas.DataFrame(results)
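For completeness, a minimal sketch of the list-of-lists variant mentioned above (the rows and column names here are illustrative, not scraped):

import pandas as pd

# Each inner list is one row, in the same order as `columns`.
rows = [
    ['Junior Data Scientist', 'BBC', 'www.jobsite.com'],          # from the example output above
    ['Senior Data Scientist', 'ExampleCorp', 'www.example.com'],  # hypothetical row
]
df = pd.DataFrame(rows, columns=['job_title', 'company', 'link'])
print(df)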
One other suggestion: don't dump this into a dataframe within the same program. Dump all your HTML files into a folder first, and then parse them again. That way, if you later need more information from the page that you hadn't considered, or if the program terminates due to a parsing error or timeout, the work is not lost. Keep the parsing separate from the crawling logic.
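A minimal sketch of that crawl/parse split, assuming a hypothetical pages/ folder, a page query parameter like the one used in the next answer, and the result class from the question:

import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
pages_dir = Path('pages')      # hypothetical folder for the raw HTML
pages_dir.mkdir(exist_ok=True)

# Crawl: save every listing page to disk untouched.
for page in range(1, 4):
    url = 'https://www.jobsite.co.uk/jobs?q=data+scientist&page={}'.format(page)
    html = requests.get(url, headers=headers).text
    (pages_dir / 'page_{}.html'.format(page)).write_text(html, encoding='utf-8')
    time.sleep(1)

# Parse: work only from the saved files, so a parsing bug never re-runs the crawl.
for path in sorted(pages_dir.glob('page_*.html')):
    soup = BeautifulSoup(path.read_text(encoding='utf-8'), 'lxml')
    print(path.name, len(soup.find_all(class_='result')), 'results')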
I think you need to define the number of pages and add that to your URL (make sure you have a placeholder for the value, which I don't think your code, nor the other answer, has). I have done this by extending your URL to include a page parameter in the query string with a placeholder.
Is your selector of class result correct? You could also use for job in soup.select('.job'):, but you would then need to define appropriate selectors to populate the values. I think it is easier to grab all the job links on each page, then visit each link and extract the values from a JSON-like string in the page. Add a Session to re-use the connection.
Explicit waits are required to prevent being blocked.
import requests
from bs4 import BeautifulSoup as bs
import json
import pandas as pd
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
results = []
links = []
pages = 3

with requests.Session() as s:
    for page in range(1, pages + 1):
        try:
            url = 'https://www.jobsite.co.uk/jobs?q=data+scientist&start=1&page={}'.format(page)
            source = s.get(url, headers=headers).text
            soup = bs(source, 'lxml')
            links.append([link['href'] for link in soup.select('.job-title a')])
        except Exception as e:
            print(e, url)
        finally:
            time.sleep(2)

    final_list = [item for sublist in links for item in sublist]

    for link in final_list:
        source = s.get(link, headers=headers).text
        soup = bs(source, 'lxml')
        data = soup.select_one('#jobPostingSchema').text  # json-like string containing all info
        item = json.loads(data)
        result = {
            'Title': item['title'],
            'Company': item['hiringOrganization']['name'],
            'Url': link,
            'Summary': bs(item['description'], 'lxml').text
        }
        results.append(result)
        time.sleep(1)

df = pd.DataFrame(results, columns=['Title', 'Company', 'Url', 'Summary'])
print(df)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8-sig', index=False)
Sample of results:
I can't imagine you want all pages, but if you do, you could use something similar to:
import requests
from bs4 import BeautifulSoup as bs
import json
import pandas as pd
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
results = []
links = []
pages = 0

def get_links(url, page):
    try:
        source = s.get(url, headers=headers).text
        soup = bs(source, 'lxml')
        page_links = [link['href'] for link in soup.select('.job-title a')]
        if page == 1:
            global pages
            pages = int(soup.select_one('.page-title span').text.replace(',', ''))
    except Exception as e:
        print(e, url)
    finally:
        time.sleep(1)
    return page_links

with requests.Session() as s:
    links.append(get_links('https://www.jobsite.co.uk/jobs?q=data+scientist&start=1&page=1', 1))
    for page in range(2, pages + 1):
        url = 'https://www.jobsite.co.uk/jobs?q=data+scientist&start=1&page={}'.format(page)
        links.append(get_links(url, page))

    final_list = [item for sublist in links for item in sublist]

    for link in final_list:
        source = s.get(link, headers=headers).text
        soup = bs(source, 'lxml')
        data = soup.select_one('#jobPostingSchema').text  # json-like string containing all info
        item = json.loads(data)
        result = {
            'Title': item['title'],
            'Company': item['hiringOrganization']['name'],
            'Url': link,
            'Summary': bs(item['description'], 'lxml').text
        }
        results.append(result)
        time.sleep(1)

df = pd.DataFrame(results, columns=['Title', 'Company', 'Url', 'Summary'])
print(df)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8-sig', index=False)
I am trying to extract some numbers from a graph on this page (https://www.zoopla.co.uk/local-info/?outcode=cm15&incode=9bq).
There are 5 tabs in that graph, and I am interested in the 5th tab (Newspapers).
When I run this piece of code, I get some info about the first tabbed graph, but soup.find_all('', id='neighbours-newspapers') returns a blank.
from bs4 import BeautifulSoup as bs
import requests
res=requests.get('https://www.zoopla.co.uk/local-info/?outcode=cm15&incode=9bq')
soup = bs(res.content, 'lxml')
housing = [item.text.replace('\n','').strip() for item in soup.find_all('',id='local-info-neighbours')]
print(housing)
newspapers = [item.text.replace('\n','').strip() for item in soup.find_all('',id='neighbours-newspapers')]
print(newspapers)
I am not sure how to access an id within an id, if that's what it is. Could someone help, please?
You can use requests and regex. The Newspapers chart is loaded from a separate widget URL, and the categories and values can be pulled out of the response with a regex:
import requests
import re
import ast
headers = {
    'Referer': 'https://www.zoopla.co.uk/',
    'User-Agent': 'Mozilla/5.0'
}
res = requests.get('https://www.zoopla.co.uk/widgets/local-info/neighbours-chart.html?outcode=cm15&incode=9bq&category=Newspapers', headers = headers)
data = re.search(r'categories: (\[.*])', res.text ,flags=re.DOTALL).group(1)
items = re.findall(r'(\[.*])', data)
papers = ast.literal_eval(items[0])
numbers = ast.literal_eval(items[1])
result = list(zip(papers, numbers))
print(result)
I have already done the scraping of Wikipedia's infobox, but I don't know how to store that data in a CSV file. Please help me out.
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen

def infobox(query):
    query = query
    url = 'https://en.wikipedia.org/wiki/' + query
    raw = urlopen(url)
    soup = bs(raw)
    table = soup.find('table', {'class': 'infobox vcard'})
    for tr in table.find_all('tr'):
        print(tr.text)

infobox('Infosys')
You have to collect the required data and write it to a CSV file; you can use the csv module, see the example below:
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
import csv

def infobox(query):
    query = query
    content_list = []
    url = 'https://en.wikipedia.org/wiki/' + query
    raw = urlopen(url)
    soup = bs(raw)
    table = soup.find('table', {'class': 'infobox vcard'})
    for tr in table.find_all('tr'):
        if len(tr.contents) > 1:
            content_list.append([tr.contents[0].text, tr.contents[1].text])
        elif tr.text:
            content_list.append([tr.text])
    write_csv_file(content_list)

def write_csv_file(content_list):
    with open(r'd:\Test.csv', mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerows(content_list)

infobox('Infosys')
Here is an outline of how you can test whether a row has both a header and a table-cell element in it, to ensure two columns (you could expand the if structure to also write td-only rows, perhaps into the first column; a sketch follows the code). I use slightly different encoding syntax for cleaner output, select for faster element selection than find, and pandas to generate the CSV.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

url = 'https://en.wikipedia.org/wiki/' + 'Infosys'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 'Referer': 'https://www.nseindia.com/'}
r = requests.get(url, headers=headers)
soup = bs(r.content, 'lxml')
table = soup.select_one('.infobox.vcard')
rows = table.find_all('tr')
output = []

for row in rows:
    if len(row.select('th, td')) == 2:
        outputRow = [row.select_one('th').text, row.select_one('td').text,
                     [item['href'] for item in row.select('td a')] if row.select_one('td a') is not None else '']
        outputRow[2] = ['https://en.wikipedia.org/wiki/Infosys' + item if item[0] == '#' else 'https://en.wikipedia.org' + item for item in outputRow[2]]
        output.append(outputRow)

df = pd.DataFrame(output)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)
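As a sketch of the expansion mentioned above (a variant of the loop, not part of the original answer), single-cell rows such as section headings can be written into the first column with the remaining columns left empty; the link extraction is omitted here for brevity:

for row in rows:
    cells = row.select('th, td')
    if len(cells) == 2:
        output.append([cells[0].text, cells[1].text, ''])
    elif len(cells) == 1:
        # heading/spanning rows populate only the first column
        output.append([cells[0].get_text(strip=True), '', ''])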