As part of a bigger web-scraping project, I want to extract links from an HTML page. Not every link on the page, only the ones in the second column of the big table.
An example of the HTML one of these links appears in:
<a href="exibir?proc=18955/989/20&offset=0">18955/989/20</a>
I would like a list of the "exibir?proc=18955/989/20&offset=0" values and NOT the "18955/989/20" text.
So far I could only get them both together (code below). How can I get rid of the part I don't need? Is there another solution? In the end I would like just a list of the links, in the order they appear on the page.
from requests_html import HTMLSession
import csv
s = HTMLSession()
def get_product_links(page):
    url = f"https://www.tce.sp.gov.br/jurisprudencia/pesquisar?txtTdPalvs=munic%C3%ADpio+pessoal+37&txtExp=temporari&txtQqUma=admiss%C3%A3o+contrata%C3%A7%C3%A3o&txtNenhPalvs=&txtNumIni=&txtNumFim=&tipoBuscaTxt=Documento&_tipoBuscaTxt=on&quantTrechos=1&processo=&exercicio=&dataAutuacaoInicio=&dataAutuacaoFim=&dataPubInicio=01%2F01%2F2021&dataPubFim=31%2F12%2F2021&_relator=1&_auditor=1&_materia=1&tipoDocumento=2&_tipoDocumento=1&acao=Executa&offset={page}"
    links = []
    r = s.get(url)
    products = r.html.find('td.small a')
    for item in products:
        links.append(item.find('a', first=True).attrs['href'])
    return links
page1 = get_product_links(0)
print(page1)
Here is one way to get those links from the second column. You're welcome to functionalize it if you want.
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm ## if using Jupyter: from tqdm.notebook import tqdm
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)
big_list = []
for x in tqdm(range(0, 410, 10)):
    url = f'https://www.tce.sp.gov.br/jurisprudencia/pesquisar?txtTdPalvs=munic%C3%ADpio+pessoal+37&txtExp=temporari&txtQqUma=admiss%C3%A3o+contrata%C3%A7%C3%A3o&txtNenhPalvs=&txtNumIni=&txtNumFim=&tipoBuscaTxt=Documento&_tipoBuscaTxt=on&quantTrechos=1&processo=&exercicio=&dataAutuacaoInicio=&dataAutuacaoFim=&dataPubInicio=01%2F01%2F2021&dataPubFim=31%2F12%2F2021&_relator=1&_auditor=1&_materia=1&tipoDocumento=2&_tipoDocumento=1&acao=Executa&offset={x}'
    r = s.get(url)
    urls = bs(r.text, 'html.parser').select('tr[class="borda-superior"] td:nth-of-type(2) a')
    big_list.extend([(x.text.strip(), 'https://www.tce.sp.gov.br/jurisprudencia/' + x.get('href')) for x in urls])
df = pd.DataFrame(big_list, columns = ['title', 'url'])
print(df)
Result in terminal:
100%
41/41 [00:30<00:00, 1.42it/s]
title url
0 18955/989/20 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=18955/989/20&offset=0
1 13614/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=13614/989/18&offset=0
2 6269/989/19 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=6269/989/19&offset=0
3 14011/989/19 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=14011/989/19&offset=0
4 14082/989/19 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=14082/989/19&offset=0
... ... ...
399 4023/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4023/989/18&offset=390
400 4024/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4024/989/18&offset=400
401 4025/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4025/989/18&offset=400
402 4026/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4026/989/18&offset=400
403 4027/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4027/989/18&offset=400
404 rows × 2 columns
Edit: If you want only the (partial) URLs as a list, all you have to do is:
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm ## if Jupyter: from tqdm.notebook import tqdm
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)
big_list = []
for x in tqdm(range(0, 410, 10)):
    url = f'https://www.tce.sp.gov.br/jurisprudencia/pesquisar?txtTdPalvs=munic%C3%ADpio+pessoal+37&txtExp=temporari&txtQqUma=admiss%C3%A3o+contrata%C3%A7%C3%A3o&txtNenhPalvs=&txtNumIni=&txtNumFim=&tipoBuscaTxt=Documento&_tipoBuscaTxt=on&quantTrechos=1&processo=&exercicio=&dataAutuacaoInicio=&dataAutuacaoFim=&dataPubInicio=01%2F01%2F2021&dataPubFim=31%2F12%2F2021&_relator=1&_auditor=1&_materia=1&tipoDocumento=2&_tipoDocumento=1&acao=Executa&offset={x}'
    r = s.get(url)
    urls = bs(r.text, 'html.parser').select('tr[class="borda-superior"] td:nth-of-type(2) a')
    big_list.extend([x.get('href') for x in urls])
print(big_list)
Result in terminal:
100%
41/41 [00:29<00:00, 1.83it/s]
['exibir?proc=18955/989/20&offset=0',
'exibir?proc=13614/989/18&offset=0',
'exibir?proc=6269/989/19&offset=0',
'exibir?proc=14011/989/19&offset=0',
'exibir?proc=14082/989/19&offset=0',
'exibir?proc=14238/989/19&offset=0',
'exibir?proc=14141/989/20&offset=0',
'exibir?proc=15371/989/19&offset=0',
'exibir?proc=15388/989/20&offset=0',
'exibir?proc=12911/989/16&offset=0',
'exibir?proc=1735/002/11&offset=10',
'exibir?proc=23494/989/18&offset=10',
'exibir?proc=24496/989/19&offset=10',
'exibir?proc=17110/989/18&offset=10',
'exibir?proc=24043/989/19&offset=10',
'exibir?proc=2515/989/20&offset=10',
'exibir?proc=1891/989/20&offset=10',
'exibir?proc=15026/989/20&offset=10',
'exibir?proc=9070/989/20&offset=10',
'exibir?proc=21543/989/20&offset=10',
'exibir?proc=19654/989/20&offset=20',
'exibir?proc=19678/989/20&offset=20',
'exibir?proc=5714/989/19&offset=20',
'exibir?proc=20493/989/20&offset=20',
'exibir?proc=4671/989/19&offset=20',
'exibir?proc=14200/989/20&offset=20',
'exibir?proc=15277/989/20&offset=20',
'exibir?proc=1363/007/12&offset=20',
'exibir?proc=4908/989/19&offset=20',
'exibir?proc=15164/989/20&offset=20',
'exibir?proc=4418/989/19&offset=30',
'exibir?proc=4890/989/19&offset=30',
'exibir?proc=17924/989/20&offset=30',
'exibir?proc=4742/989/19&offset=30',
'exibir?proc=800226/465/09&offset=30',
'exibir?proc=23880/989/20&offset=30',
'exibir?proc=4561/989/19&offset=30',
'exibir?proc=4540/989/19&offset=30',
'exibir?proc=4471/989/19&offset=30',
'exibir?proc=4982/989/19&offset=30',
'exibir?proc=4519/989/19&offset=40',
'exibir?proc=4632/989/19&offset=40',
'exibir?proc=4536/989/19&offset=40',
'exibir?proc=4622/989/19&offset=40',
'exibir?proc=14734/989/16&offset=40',
'exibir?proc=4678/989/19&offset=40',
'exibir?proc=5501/989/16&offset=40',
'exibir?proc=13988/989/17&offset=40',
'exibir?proc=4854/989/19&offset=40',
'exibir?proc=4609/989/19&offset=40',
'exibir?proc=4717/989/19&offset=50',
'exibir?proc=4673/989/19&offset=50',
'exibir?proc=20988/989/20&offset=50',
'exibir?proc=4481/989/19&offset=50',
'exibir?proc=4675/989/19&offset=50',
'exibir?proc=4451/989/19&offset=50',
'exibir?proc=12943/989/19&offset=50',
'exibir?proc=23644/989/18&offset=50',
'exibir?proc=23875/989/18&offset=50',
'exibir?proc=4679/989/19&offset=50',
'exibir?proc=4425/989/19&offset=60',
'exibir?proc=2726/989/18&offset=60',
'exibir?proc=17172/989/20&offset=60',
'exibir?proc=2901/989/18&offset=60',
'exibir?proc=4469/989/19&offset=60',
'exibir?proc=299/012/13&offset=60',
'exibir?proc=4915/989/19&offset=60',
'exibir?proc=22649/989/20&offset=60',
'exibir?proc=22887/989/20&offset=60',
'exibir?proc=4721/989/19&offset=60',
'exibir?proc=4378/989/19&offset=70',
'exibir?proc=4935/989/19&offset=70',
'exibir?proc=4714/989/19&offset=70',
'exibir?proc=1230/989/21&offset=70',
'exibir?proc=1847/989/21&offset=70',
'exibir?proc=15606/989/21&offset=70',
'exibir?proc=11267/989/18&offset=70',
'exibir?proc=1232/004/12&offset=70',
'exibir?proc=4421/989/19&offset=70',
'exibir?proc=4931/989/19&offset=70',
'exibir?proc=4885/989/19&offset=80',
'exibir?proc=5002/989/19&offset=80',
'exibir?proc=21592/989/20&offset=80',
'exibir?proc=4839/989/19&offset=80',
'exibir?proc=4783/989/19&offset=80',
'exibir?proc=4599/989/19&offset=80',
'exibir?proc=4702/989/19&offset=80',
'exibir?proc=4617/989/19&offset=80',
'exibir?proc=4970/989/16&offset=80',
'exibir?proc=4492/989/19&offset=80',
'exibir?proc=2582/989/17&offset=90',
'exibir?proc=4993/989/19&offset=90',
'exibir?proc=4658/989/19&offset=90',
'exibir?proc=4606/989/19&offset=90',
'exibir?proc=4387/989/19&offset=90',
'exibir?proc=14549/989/19&offset=90',
'exibir?proc=4525/989/18&offset=90',
'exibir?proc=4713/989/19&offset=90',
'exibir?proc=838/001/14&offset=90',
'exibir?proc=4971/989/19&offset=90',
'exibir?proc=17505/989/18&offset=100',
'exibir?proc=5096/989/18&offset=100',
'exibir?proc=4413/989/19&offset=100',
'exibir?proc=4392/989/19&offset=100',
'exibir?proc=15132/989/20&offset=100',
'exibir?proc=4517/989/19&offset=100',
'exibir?proc=4760/989/19&offset=100',
'exibir?proc=18509/989/19&offset=100',
'exibir?proc=4952/989/19&offset=100',
'exibir?proc=5013/989/19&offset=100',
'exibir?proc=12922/989/19&offset=110',
'exibir?proc=6194/989/16&offset=110',
'exibir?proc=19323/989/20&offset=110',
'exibir?proc=4732/989/19&offset=110',
...]
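If you would rather stay with your original requests_html setup, the same second-column restriction can be expressed there as well. A minimal sketch, assuming the result rows carry the same borda-superior class used in the BeautifulSoup version above (not re-verified against the live page):
from requests_html import HTMLSession

s = HTMLSession()

def get_product_links(page):
    # same search URL as in the question, paginated via the offset parameter
    url = f"https://www.tce.sp.gov.br/jurisprudencia/pesquisar?txtTdPalvs=munic%C3%ADpio+pessoal+37&txtExp=temporari&txtQqUma=admiss%C3%A3o+contrata%C3%A7%C3%A3o&txtNenhPalvs=&txtNumIni=&txtNumFim=&tipoBuscaTxt=Documento&_tipoBuscaTxt=on&quantTrechos=1&processo=&exercicio=&dataAutuacaoInicio=&dataAutuacaoFim=&dataPubInicio=01%2F01%2F2021&dataPubFim=31%2F12%2F2021&_relator=1&_auditor=1&_materia=1&tipoDocumento=2&_tipoDocumento=1&acao=Executa&offset={page}"
    r = s.get(url)
    links = []
    for row in r.html.find('tr.borda-superior'):   # one table row per result
        cells = row.find('td')
        if len(cells) >= 2:
            a = cells[1].find('a', first=True)     # link in the second column only
            if a is not None:
                links.append(a.attrs['href'])
    return links

print(get_product_links(0))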
I'm trying to web scrape a table with Python, and I want to scrape all the data across the 417 pages on this website: https://hk.centanet.com/findproperty/en/list/transaction?gclid=Cj0KCQjwnbmaBhD-ARIsAGTPcfVae1prjf_9aKh0dbnaBbzYvi3VhKn4qEXDAQJMS6ZvOiet8GLqzaAaAqH_EALw_wcB&q=3qoOuFNgwUeioKQCtZ9KFA
Since the web_url does not change when I click next page, I cannot just iterate over different URLs with ?page=[1,2,3,4] etc.
The desired outcome is to get all the data from all 417 pages. Since 417 pages may be a lot, to simplify, let's say getting data from more than just page 1, e.g. both the first and second pages.
I've tried both methods
(1) scraping from page elements with Beautiful Soup & Requests and
(2) directly from the webpage api_url found in Network
but both methods failed to get the data from all pages... I'm not sure which part I'm doing wrong; my guess is that the URL has some JS controlling which page's data is sent to the API, but I'm not sure how to simulate that process :(
Below is the code for both methods.
Method 1
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import json
web_url = 'https://hk.centanet.com/findproperty/en/list/transaction?gclid=Cj0KCQjwnbmaBhD-ARIsAGTPcfVae1prjf_9aKh0dbnaBbzYvi3VhKn4qEXDAQJMS6ZvOiet8GLqzaAaAqH_EALw_wcB&q=3qoOuFNgwUeioKQCtZ9KFA'
data = {"name": "Value"}
r = requests.get(web_url)
print(f'status: {r.status_code}')
if r.status_code == 200:
    raw_data = BeautifulSoup(r.content, 'html.parser')
    table_content = raw_data.find_all('div', {'class': 'cv-structured-list-item cv-structured-list-item--standard bx--structured-list-row'})
    print(len(table_content))
Method 2
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import json
web_url = 'https://hk.centanet.com/findproperty/api/Transaction/Search'
_header = {'lang':'en',"postType":"Both","day":"Day1095","sort":"InsOrRegDate","order":"Descending","size":'24',"offset":'24',"pageSource":"search","gclid":"Cj0KCQjwnbmaBhD-ARIsAGTPcfVae1prjf_9aKh0dbnaBbzYvi3VhKn4qEXDAQJMS6ZvOiet8GLqzaAaAqH_EALw_wcB","q":"3qoOuFNgwUeioKQCtZ9KFA"}
_data = {"name":"Value"}
req = requests.post(web_url, headers=_header, json=_data)
print(f'Status Code: {req.status_code}')
if req.status_code == 200:
    data = json.loads(req.content)['data']
    print(len(data))
    print(data[0].get('displayText').get('addr'))
You can handle the pagination through the request payload by changing the offset key, since the data is loaded from the API with a POST request and comes back as JSON.
Example:
import requests
import pandas as pd
api_url= 'https://hk.centanet.com/findproperty/api/Transaction/Search'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'Content-Type': 'application/json'
}
payload= {"postType":"Both","day":"Day1095","sort":"InsOrRegDate","order":"Descending","size":24,"offset":"24","pageSource":"search","gclid":"Cj0KCQjwnbmaBhD-ARIsAGTPcfVae1prjf_9aKh0dbnaBbzYvi3VhKn4qEXDAQJMS6ZvOiet8GLqzaAaAqH_EALw_wcB","q":"3qoOuFNgwUeioKQCtZ9KFA"}
lst = []
for p in range(0, 240, 24):
    payload['offset'] = p
    res = requests.post(api_url, headers=headers, json=payload)
    for item in res.json()['data']:
        d = {
            'flat': item['xAxis']
        }
        lst.append(d)
df = pd.DataFrame(lst)
print(df)
Output:
flat
0 A室
1 B室
2 B室
3 2室
4 C室
.. ...
235 E室
236 F室
237 E室
238 8室
239 6室
[240 rows x 1 columns]
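If you want to walk all 417 pages rather than a fixed range, one option is to keep raising offset until the API stops returning records. A minimal sketch built on the same payload; the empty-response stop condition is an assumption about this API rather than documented behaviour:
import requests
import pandas as pd

api_url = 'https://hk.centanet.com/findproperty/api/Transaction/Search'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'Content-Type': 'application/json'
}
payload = {"postType": "Both", "day": "Day1095", "sort": "InsOrRegDate", "order": "Descending",
           "size": 24, "offset": 0, "pageSource": "search",
           "gclid": "Cj0KCQjwnbmaBhD-ARIsAGTPcfVae1prjf_9aKh0dbnaBbzYvi3VhKn4qEXDAQJMS6ZvOiet8GLqzaAaAqH_EALw_wcB",
           "q": "3qoOuFNgwUeioKQCtZ9KFA"}

rows = []
offset = 0
while True:
    payload['offset'] = offset
    res = requests.post(api_url, headers=headers, json=payload, timeout=30)
    res.raise_for_status()
    data = res.json().get('data', [])
    if not data:               # no more records: stop paginating
        break
    rows.extend(data)
    offset += payload['size']  # advance by one page of 24 results

df = pd.DataFrame(rows)
print(df.shape)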
I would like to obtain a table that appears on a URL: https://www.coronavirus.vic.gov.au/exposure-sites
When right-clicking and inspecting the element, it is evident there is a table element with a class that can be referenced. However, when the page is requested with requests, this table does not appear.
Reproducible example:
import pandas as pd
import requests
from bs4 import BeautifulSoup
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
link = 'https://www.coronavirus.vic.gov.au/exposure-sites'
r = requests.get(link, headers=header)
soup = BeautifulSoup(r.text, "html5lib")
htmltable = soup.find('table', { 'class' : "rpl-row rpl-search-results-layout__main rpl-row--gutter" })
# Error Appears here because the above doesn't exist even though it should?
print(htmltable)
def tableDataText(table):
    """Parses an html segment started with tag <table> followed
    by multiple <tr> (table rows) and inner <td> (table data) tags.
    It returns a list of rows with inner columns.
    Accepts only one <th> (table header/data) in the first row.
    """
    def rowgetDataText(tr, coltag='td'):  # td (data) or th (header)
        return [td.get_text(strip=True) for td in tr.find_all(coltag)]
    rows = []
    trs = table.find_all('tr')
    headerow = rowgetDataText(trs[0], 'th')
    if headerow:  # if there is a header row include first
        rows.append(headerow)
        trs = trs[1:]
    for tr in trs:  # for every table row
        rows.append(rowgetDataText(tr, 'td'))  # data row
    return rows
list_table = tableDataText(htmltable)
df = pd.DataFrame(list_table[1:], columns=list_table[0])
df
The end state should be a table which is a collection of the 18 pages of tables on the webpage.
You can make a call to this URL to get the data as JSON. It returns a list of dictionaries, and as you loop over it the data can be extracted using the key associated with each field:
import requests

res = requests.get("https://www.coronavirus.vic.gov.au/sdp-ckan?resource_id=afb52611-6061-4a2b-9110-74c920bede77&limit=10000")
data = res.json()
main_data = data['result']['records']
for i in range(len(main_data)):
    print(main_data[i]['Suburb'])
    print(main_data[i]['Site_title'])
Output:
Newport
TyrePlus Newport
Newport
TyrePlus Newport
Newport
...
How to find the URL: go to Chrome developer mode, open the Network tab, and refresh the site; look through the requests on the left-hand side, and from the Preview pane you will get to know which one returns the data.
For a DataFrame:
import pandas as pd
df=pd.DataFrame(main_data)
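Since the end state you describe is a single table covering all 18 pages, the combined records can also go straight out to a file. A small follow-up sketch; the only column names assumed here are the Suburb and Site_title keys already shown above, and the output filename is just an example:
import pandas as pd

df = pd.DataFrame(main_data)
print(df[['Suburb', 'Site_title']].head())    # quick check on two known columns
df.to_csv('exposure_sites.csv', index=False)  # example output filename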
"Hello, i am quite new to web-scraping. I recently retrieved a list of web-links and there are URLs within these links containing data from tables. I am planning to scrape the data but can't seem to even get the URLs. Any form of help is much appreciated"
"The list of weblinks are
https://aviation-safety.net/database/dblist.php?Year=1919
https://aviation-safety.net/database/dblist.php?Year=1920
https://aviation-safety.net/database/dblist.php?Year=1921
https://aviation-safety.net/database/dblist.php?Year=1922
https://aviation-safety.net/database/dblist.php?Year=2019"
"From the list of links, i am planning to
a. get the URLs within these links
https://aviation-safety.net/database/record.php?id=19190802-0
https://aviation-safety.net/database/record.php?id=19190811-0
https://aviation-safety.net/database/record.php?id=19200223-0"
"b. get data from tables within each URL
(e.g., Incident date, incident time, type, operator, registration, msn, first flight, classification)"
#Get the list of weblinks
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
headers = {'insert user agent'}
#start of code
mainurl = "https://aviation-safety.net/database/"
def getAndParseURL(mainurl):
    result = requests.get(mainurl)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.find_all('a', href=True)
    return datatable
datatable = getAndParseURL(mainurl)
#go through the content and grab the URLs
links = []
for link in datatable:
    if 'Year' in link['href']:
        url = link['href']
        links.append(mainurl + url)
#check if links are in dataframe
df = pd.DataFrame(links, columns=['url'])
df.head(10)
#save the links to a csv
df.to_csv('aviationsafetyyearlinks.csv')
#from the csv read each web-link and get URLs within each link
import csv
from urllib.request import urlopen
contents = []
df = pd.read_csv('aviationsafetyyearlinks.csv')
urls = df['url']
for url in urls:
    contents.append(url)

for url in contents:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    addtable = soup.find_all('a', href=True)
"I am only able to get the list of web-links and am unable to get the URLs nor the data within these web-links. The code continually shows arrays
not really sure where my code is wrong, appreciate any help and many thanks in advance."
When requesting the page, add a User-Agent header.
headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
mainurl = "https://aviation-safety.net/database/dblist.php?Year=1919"

def getAndParseURL(mainurl):
    result = requests.get(mainurl, headers=headers)
    soup = BeautifulSoup(result.content, 'html.parser')
    datatable = soup.select('a[href*="database/record"]')
    return datatable

print(getAndParseURL(mainurl))
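To move on to part (b) of the question, you could then visit each record link found this way and let pandas pull the detail tables out of the page. A minimal sketch for a single record page; which table index actually holds the incident fields (date, time, type, operator, registration, msn, first flight, classification) is something you would confirm by inspecting the printed output, not an established fact about the site:
import requests
import pandas as pd

headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

# one of the record URLs listed in the question
record_url = 'https://aviation-safety.net/database/record.php?id=19190802-0'
r = requests.get(record_url, headers=headers)

# read_html returns one DataFrame per <table> element on the page
tables = pd.read_html(r.text)
for i, t in enumerate(tables):
    print(i, t.shape)  # inspect which table holds the incident details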
I have already done the scraping of Wikipedia's infobox, but I don't know how to store that data in a CSV file. Please help me out.
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
def infobox(query):
    url = 'https://en.wikipedia.org/wiki/' + query
    raw = urlopen(url)
    soup = bs(raw)
    table = soup.find('table', {'class': 'infobox vcard'})
    for tr in table.find_all('tr'):
        print(tr.text)
infobox('Infosys')
You have to collect the required data and write it to a CSV file; you can use the csv module. See the example below:
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
import csv

def infobox(query):
    content_list = []
    url = 'https://en.wikipedia.org/wiki/' + query
    raw = urlopen(url)
    soup = bs(raw)
    table = soup.find('table', {'class': 'infobox vcard'})
    for tr in table.find_all('tr'):
        if len(tr.contents) > 1:
            # rows with a label cell and a value cell
            content_list.append([tr.contents[0].text, tr.contents[1].text])
        elif tr.text:
            # rows with a single cell (e.g. section headings)
            content_list.append([tr.text])
    write_csv_file(content_list)

def write_csv_file(content_list):
    with open(r'd:\Test.csv', mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerows(content_list)

infobox('Infosys')
Here is an outline of how you can test whether a row has both a header and a table-cell element in it, to ensure two columns (you could expand the if structure to also keep the single-cell rows, perhaps populating just the first column; see the sketch after the code below). I use slightly different encoding syntax for cleaner output, select for faster element selection than find, and pandas to generate the csv.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
url = 'https://en.wikipedia.org/wiki/'+ 'Infosys'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36', 'Referer': 'https://www.nseindia.com/'}
r = requests.get(url, headers=headers)
soup = bs(r.content,'lxml')
table =soup.select_one('.infobox.vcard')
rows = table.find_all('tr')
output = []
for row in rows:
    if len(row.select('th, td')) == 2:
        outputRow = [row.select_one('th').text, row.select_one('td').text,
                     [item['href'] for item in row.select('td a')] if row.select_one('td a') is not None else '']
        outputRow[2] = ['https://en.wikipedia.org/wiki/Infosys' + item if item[0] == '#' else 'https://en.wikipedia.org' + item for item in outputRow[2]]
        output.append(outputRow)

df = pd.DataFrame(output)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)
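As mentioned above, if you also want to keep the rows that only carry a single th or td (section headings, the logo caption, and so on), a hypothetical extension of the same loop could pad the missing second column. This simplified sketch reuses the rows variable from the code above and leaves out the link column for brevity:
output = []
for row in rows:
    cells = row.select('th, td')
    if len(cells) == 2:
        # normal label/value rows
        output.append([cells[0].get_text(strip=True), cells[1].get_text(strip=True)])
    elif len(cells) == 1:
        # single-cell rows: keep the text in the first column, leave the second blank
        output.append([cells[0].get_text(strip=True), ''])

df = pd.DataFrame(output, columns=['field', 'value'])
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)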