How to scrape multiple pages with the same URL - Python 3

https://hk.centanet.com/findproperty/en/list/transaction?gclid=Cj0KCQjwnbmaBhD-ARIsAGTPcfVae1prjf_9aKh0dbnaBbzYvi3VhKn4qEXDAQJMS6ZvOiet8GLqzaAaAqH_EALw_wcB&q=3qoOuFNgwUeioKQCtZ9KFA
I'm trying to scrape a table with Python and I want to get the data from all 417 pages on this website. Since the web_url does not change when I click the next page, I cannot simply iterate over different URLs with ?page=1, ?page=2, etc.
The desired outcome is to get the data from all 417 pages. Since 417 pages may be a lot, to simplify, let's say I just want more than page 1, e.g. both the first and second pages.
I've tried two methods:
(1) scraping the page elements with Beautiful Soup & Requests, and
(2) requesting the API URL found in the browser's Network tab directly,
but both methods fail to find the data on all pages. I'm not sure which part I'm doing wrong; my guess is that some JS on the page controls which page's data is sent by the API, but I don't know how to simulate this process :(
Below is the code for both methods.
Method 1
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import json

web_url = 'https://hk.centanet.com/findproperty/en/list/transaction?gclid=Cj0KCQjwnbmaBhD-ARIsAGTPcfVae1prjf_9aKh0dbnaBbzYvi3VhKn4qEXDAQJMS6ZvOiet8GLqzaAaAqH_EALw_wcB&q=3qoOuFNgwUeioKQCtZ9KFA'
data = {"name": "Value"}

r = requests.get(web_url)
print(f'status: {r.status_code}')
if r.status_code == 200:
    raw_data = BeautifulSoup(r.content, 'html.parser')
    table_content = raw_data.find_all('div', {'class': 'cv-structured-list-item cv-structured-list-item--standard bx--structured-list-row'})
    print(len(table_content))
Method 2
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import json

web_url = 'https://hk.centanet.com/findproperty/api/Transaction/Search'
_header = {'lang': 'en', "postType": "Both", "day": "Day1095", "sort": "InsOrRegDate", "order": "Descending", "size": '24', "offset": '24', "pageSource": "search", "gclid": "Cj0KCQjwnbmaBhD-ARIsAGTPcfVae1prjf_9aKh0dbnaBbzYvi3VhKn4qEXDAQJMS6ZvOiet8GLqzaAaAqH_EALw_wcB", "q": "3qoOuFNgwUeioKQCtZ9KFA"}
_data = {"name": "Value"}

req = requests.post(web_url, headers=_header, json=_data)
print(f'Status Code: {req.status_code}')
if req.status_code == 200:
    data = json.loads(req.content)['data']
    print(len(data))
    print(data[0].get('displayText').get('addr'))

You can paginate by changing the offset key in the request payload, since the data is loaded from the API via a POST request that returns JSON.
Example:
import requests
import pandas as pd

api_url = 'https://hk.centanet.com/findproperty/api/Transaction/Search'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'Content-Type': 'application/json'
}
payload = {"postType": "Both", "day": "Day1095", "sort": "InsOrRegDate", "order": "Descending", "size": 24, "offset": "24", "pageSource": "search", "gclid": "Cj0KCQjwnbmaBhD-ARIsAGTPcfVae1prjf_9aKh0dbnaBbzYvi3VhKn4qEXDAQJMS6ZvOiet8GLqzaAaAqH_EALw_wcB", "q": "3qoOuFNgwUeioKQCtZ9KFA"}

lst = []
for p in range(0, 240, 24):
    payload['offset'] = p
    res = requests.post(api_url, headers=headers, json=payload)
    for item in res.json()['data']:
        d = {
            'flat': item['xAxis']
        }
        lst.append(d)

df = pd.DataFrame(lst)
print(df)
Output:
flat
0 A室
1 B室
2 B室
3 2室
4 C室
.. ...
235 E室
236 F室
237 E室
238 8室
239 6室
[240 rows x 1 columns]
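The range(0, 240, 24) above only covers the first 10 pages. To cover all 417 pages mentioned in the question, one option is to keep increasing the offset until the API stops returning rows. Below is a minimal sketch built on the same api_url, headers and payload as above; it assumes the response keeps the 'data' key and that an empty list means there are no more pages.
rows = []
offset = 0
while offset < 417 * 24:                      # 417 pages x 24 rows per page
    payload['offset'] = offset
    res = requests.post(api_url, headers=headers, json=payload)
    page_data = res.json().get('data', [])
    if not page_data:                         # assumption: empty list = past the last page
        break
    rows.extend({'flat': item['xAxis']} for item in page_data)
    offset += 24

df = pd.DataFrame(rows)
print(len(df))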

Related

How to select only a link when web scraping HTML where the attribute also has text?

As part of a bigger web-scraping project, I want to extract HTML links from a page. Not all the links on the page, only those in the second column of the big table.
An example of how these links appear in the HTML (the href holds the link, the text is the process number):
<a href="exibir?proc=18955/989/20&offset=0">18955/989/20</a>
I would like a list of the "exibir?proc=18955/989/20&offset=0" values and NOT the "18955/989/20" text.
So far, I could only get them both together (code below). How can I get rid of the text? Is there another solution? In the end I would like to have only a list of the links in the order they appear.
from requests_html import HTMLSession
import csv

s = HTMLSession()

def get_product_links(page):
    url = f"https://www.tce.sp.gov.br/jurisprudencia/pesquisar?txtTdPalvs=munic%C3%ADpio+pessoal+37&txtExp=temporari&txtQqUma=admiss%C3%A3o+contrata%C3%A7%C3%A3o&txtNenhPalvs=&txtNumIni=&txtNumFim=&tipoBuscaTxt=Documento&_tipoBuscaTxt=on&quantTrechos=1&processo=&exercicio=&dataAutuacaoInicio=&dataAutuacaoFim=&dataPubInicio=01%2F01%2F2021&dataPubFim=31%2F12%2F2021&_relator=1&_auditor=1&_materia=1&tipoDocumento=2&_tipoDocumento=1&acao=Executa&offset={page}"
    links = []
    r = s.get(url)
    products = r.html.find('td.small a')
    for item in products:
        links.append(item.find('a', first=True).attrs['href'])
    return links

page1 = get_product_links(0)
print(page1)
Here is one way to get those links from the second column. You're welcome to functionalize it if you want.
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm  ## if using Jupyter: from tqdm.notebook import tqdm
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)

big_list = []
for x in tqdm(range(0, 410, 10)):
    url = f'https://www.tce.sp.gov.br/jurisprudencia/pesquisar?txtTdPalvs=munic%C3%ADpio+pessoal+37&txtExp=temporari&txtQqUma=admiss%C3%A3o+contrata%C3%A7%C3%A3o&txtNenhPalvs=&txtNumIni=&txtNumFim=&tipoBuscaTxt=Documento&_tipoBuscaTxt=on&quantTrechos=1&processo=&exercicio=&dataAutuacaoInicio=&dataAutuacaoFim=&dataPubInicio=01%2F01%2F2021&dataPubFim=31%2F12%2F2021&_relator=1&_auditor=1&_materia=1&tipoDocumento=2&_tipoDocumento=1&acao=Executa&offset={x}'
    r = s.get(url)
    urls = bs(r.text, 'html.parser').select('tr[class="borda-superior"] td:nth-of-type(2) a')
    big_list.extend([(x.text.strip(), 'https://www.tce.sp.gov.br/jurisprudencia/' + x.get('href')) for x in urls])

df = pd.DataFrame(big_list, columns=['title', 'url'])
print(df)
Result in terminal:
100%
41/41 [00:30<00:00, 1.42it/s]
title url
0 18955/989/20 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=18955/989/20&offset=0
1 13614/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=13614/989/18&offset=0
2 6269/989/19 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=6269/989/19&offset=0
3 14011/989/19 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=14011/989/19&offset=0
4 14082/989/19 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=14082/989/19&offset=0
... ... ...
399 4023/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4023/989/18&offset=390
400 4024/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4024/989/18&offset=400
401 4025/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4025/989/18&offset=400
402 4026/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4026/989/18&offset=400
403 4027/989/18 https://www.tce.sp.gov.br/jurisprudencia/exibir?proc=4027/989/18&offset=400
404 rows × 2 columns
Edit: If you want only the (partial) url as a list, all you have to do is:
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm  ## if Jupyter: from tqdm.notebook import tqdm
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# same headers as in the snippet above
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)

big_list = []
for x in tqdm(range(0, 410, 10)):
    url = f'https://www.tce.sp.gov.br/jurisprudencia/pesquisar?txtTdPalvs=munic%C3%ADpio+pessoal+37&txtExp=temporari&txtQqUma=admiss%C3%A3o+contrata%C3%A7%C3%A3o&txtNenhPalvs=&txtNumIni=&txtNumFim=&tipoBuscaTxt=Documento&_tipoBuscaTxt=on&quantTrechos=1&processo=&exercicio=&dataAutuacaoInicio=&dataAutuacaoFim=&dataPubInicio=01%2F01%2F2021&dataPubFim=31%2F12%2F2021&_relator=1&_auditor=1&_materia=1&tipoDocumento=2&_tipoDocumento=1&acao=Executa&offset={x}'
    r = s.get(url)
    urls = bs(r.text, 'html.parser').select('tr[class="borda-superior"] td:nth-of-type(2) a')
    big_list.extend([x.get('href') for x in urls])

print(big_list)
Result in terminal:
100%
41/41 [00:29<00:00, 1.83it/s]
['exibir?proc=18955/989/20&offset=0',
'exibir?proc=13614/989/18&offset=0',
'exibir?proc=6269/989/19&offset=0',
'exibir?proc=14011/989/19&offset=0',
'exibir?proc=14082/989/19&offset=0',
'exibir?proc=14238/989/19&offset=0',
'exibir?proc=14141/989/20&offset=0',
'exibir?proc=15371/989/19&offset=0',
'exibir?proc=15388/989/20&offset=0',
'exibir?proc=12911/989/16&offset=0',
'exibir?proc=1735/002/11&offset=10',
'exibir?proc=23494/989/18&offset=10',
'exibir?proc=24496/989/19&offset=10',
'exibir?proc=17110/989/18&offset=10',
'exibir?proc=24043/989/19&offset=10',
'exibir?proc=2515/989/20&offset=10',
'exibir?proc=1891/989/20&offset=10',
'exibir?proc=15026/989/20&offset=10',
'exibir?proc=9070/989/20&offset=10',
'exibir?proc=21543/989/20&offset=10',
'exibir?proc=19654/989/20&offset=20',
'exibir?proc=19678/989/20&offset=20',
'exibir?proc=5714/989/19&offset=20',
'exibir?proc=20493/989/20&offset=20',
'exibir?proc=4671/989/19&offset=20',
'exibir?proc=14200/989/20&offset=20',
'exibir?proc=15277/989/20&offset=20',
'exibir?proc=1363/007/12&offset=20',
'exibir?proc=4908/989/19&offset=20',
'exibir?proc=15164/989/20&offset=20',
'exibir?proc=4418/989/19&offset=30',
'exibir?proc=4890/989/19&offset=30',
'exibir?proc=17924/989/20&offset=30',
'exibir?proc=4742/989/19&offset=30',
'exibir?proc=800226/465/09&offset=30',
'exibir?proc=23880/989/20&offset=30',
'exibir?proc=4561/989/19&offset=30',
'exibir?proc=4540/989/19&offset=30',
'exibir?proc=4471/989/19&offset=30',
'exibir?proc=4982/989/19&offset=30',
'exibir?proc=4519/989/19&offset=40',
'exibir?proc=4632/989/19&offset=40',
'exibir?proc=4536/989/19&offset=40',
'exibir?proc=4622/989/19&offset=40',
'exibir?proc=14734/989/16&offset=40',
'exibir?proc=4678/989/19&offset=40',
'exibir?proc=5501/989/16&offset=40',
'exibir?proc=13988/989/17&offset=40',
'exibir?proc=4854/989/19&offset=40',
'exibir?proc=4609/989/19&offset=40',
'exibir?proc=4717/989/19&offset=50',
'exibir?proc=4673/989/19&offset=50',
'exibir?proc=20988/989/20&offset=50',
'exibir?proc=4481/989/19&offset=50',
'exibir?proc=4675/989/19&offset=50',
'exibir?proc=4451/989/19&offset=50',
'exibir?proc=12943/989/19&offset=50',
'exibir?proc=23644/989/18&offset=50',
'exibir?proc=23875/989/18&offset=50',
'exibir?proc=4679/989/19&offset=50',
'exibir?proc=4425/989/19&offset=60',
'exibir?proc=2726/989/18&offset=60',
'exibir?proc=17172/989/20&offset=60',
'exibir?proc=2901/989/18&offset=60',
'exibir?proc=4469/989/19&offset=60',
'exibir?proc=299/012/13&offset=60',
'exibir?proc=4915/989/19&offset=60',
'exibir?proc=22649/989/20&offset=60',
'exibir?proc=22887/989/20&offset=60',
'exibir?proc=4721/989/19&offset=60',
'exibir?proc=4378/989/19&offset=70',
'exibir?proc=4935/989/19&offset=70',
'exibir?proc=4714/989/19&offset=70',
'exibir?proc=1230/989/21&offset=70',
'exibir?proc=1847/989/21&offset=70',
'exibir?proc=15606/989/21&offset=70',
'exibir?proc=11267/989/18&offset=70',
'exibir?proc=1232/004/12&offset=70',
'exibir?proc=4421/989/19&offset=70',
'exibir?proc=4931/989/19&offset=70',
'exibir?proc=4885/989/19&offset=80',
'exibir?proc=5002/989/19&offset=80',
'exibir?proc=21592/989/20&offset=80',
'exibir?proc=4839/989/19&offset=80',
'exibir?proc=4783/989/19&offset=80',
'exibir?proc=4599/989/19&offset=80',
'exibir?proc=4702/989/19&offset=80',
'exibir?proc=4617/989/19&offset=80',
'exibir?proc=4970/989/16&offset=80',
'exibir?proc=4492/989/19&offset=80',
'exibir?proc=2582/989/17&offset=90',
'exibir?proc=4993/989/19&offset=90',
'exibir?proc=4658/989/19&offset=90',
'exibir?proc=4606/989/19&offset=90',
'exibir?proc=4387/989/19&offset=90',
'exibir?proc=14549/989/19&offset=90',
'exibir?proc=4525/989/18&offset=90',
'exibir?proc=4713/989/19&offset=90',
'exibir?proc=838/001/14&offset=90',
'exibir?proc=4971/989/19&offset=90',
'exibir?proc=17505/989/18&offset=100',
'exibir?proc=5096/989/18&offset=100',
'exibir?proc=4413/989/19&offset=100',
'exibir?proc=4392/989/19&offset=100',
'exibir?proc=15132/989/20&offset=100',
'exibir?proc=4517/989/19&offset=100',
'exibir?proc=4760/989/19&offset=100',
'exibir?proc=18509/989/19&offset=100',
'exibir?proc=4952/989/19&offset=100',
'exibir?proc=5013/989/19&offset=100',
'exibir?proc=12922/989/19&offset=110',
'exibir?proc=6194/989/16&offset=110',
'exibir?proc=19323/989/20&offset=110',
'exibir?proc=4732/989/19&offset=110',
...]

How can I web scrape a government website with Python? I cannot do it properly, the table just does not show

I am trying to web scrape the data from this government website: https://www.itf.gov.hk/en/project-search/search-result/index.html?isAdvSearch=1&Programmes=TVP
However, after reading a lot about web scraping and following YouTube videos, I still can't do it. Can someone please help?
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://www.itf.gov.hk/en/project-search/project-profile/index.html?ReferenceNo=TVP/2122/22'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

table = soup.find('table', {'class': 'colorTbl projDetailTbl'})
headers = []
for i in table.find_all('th'):
    title = i.text.strip()
    headers.append(title)

df = pd.DataFrame(columns=headers)
for row in table.find_all('tr')[1:]:
    data = row.find_all('td')
    row_data = [td.text.strip() for td in data]
    length = len(df)
    df.loc[length] = row_data
The table doesn't show at all; the result is None. Please help me.
The table is rendered through JavaScript, and the data is returned through an API. You need to get the data from that source.
Once you have the "Reference" values, you can feed them into the API again to get the "linked" data, and finally merge everything together.
Code:
import pandas as pd
import requests

tokenUrl = 'https://www.itf.gov.hk/API/Token/Get'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
token = requests.get(tokenUrl, headers=headers).text

url = 'https://www.itf.gov.hk/API/Project/Search'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
    'verificationtoken': token}

# Get Full Tables
page = 1
rows = []
while True:
    payload = {
        'Page': '%s' % page,
        'Programmes[]': 'TVP'}
    jsonData = requests.post(url, headers=headers, data=payload).json()
    rows += jsonData['Records']
    page += 1
    print(f"{len(rows)} of {jsonData['Total']}")
    if len(rows) == jsonData['Total']:
        print('Complete')
        break

df = pd.DataFrame(rows)
references = list(df['Reference'])

# Use References to get "linked" data
refRows = []
for ref in references:
    print(ref)
    url = 'https://www.itf.gov.hk/API/Project/Get'
    payload = {
        'Reference': ref}
    jsonData = requests.post(url, headers=headers, data=payload).json()
    TechnologicalSolutions = jsonData['TechnologicalSolutions'][0]
    row = jsonData
    row.update(TechnologicalSolutions)
    refRows.append(row)

refDf = pd.DataFrame(refRows)

# Merge together
df = df.merge(refDf, how='left', on=['Reference'])
Output:
print(df)
Reference ... SCName
0 TVP/2122/22 ... 销售点管理系统
1 TVP/2120/22 ... 销售点管理系统
2 TVP/2107/22 ... 企业资源规划方案
3 TVP/2105/22 ... 企业资源规划方案
4 TVP/2103/22 ... 电子库存管理系统
5 TVP/2101/22 ... 文件管理及流动存取系统
6 TVP/2097/22 ... 销售点管理系统
7 TVP/2092/22 ... 企业资源规划方案
8 TVP/2086/22 ... 销售点管理系统
9 TVP/2085/22 ... 企业资源规划方案
[10 rows x 66 columns]
After inspecting the website, I see no such class as projDetailTbl. However, you can use the following code to process the data from the website:
table = soup.find('table', class_='colorTbl')
or you can search on id if there happen to be more classes of the same name.
table = soup.find('table', id='searchResultOutput')
I'm afraid it cannot be done this way, because the site generates the table with JavaScript. The body of the HTML that you are getting in your soup has no data in it:
<body>
<div id="content"><script src="/filemanager/template/common/js/search.js" type="text/javascript"></script>
<h1 class="a_center">Project Profile</h1>
<div class="tableResponsive" id="projectProfile"> 
<table class="colorTbl projDetailTbl">
<tbody></tbody>
</table>
<div class="remarks" id="techAreaRemark" style="display: none;">* The primary technology area as indicated by the project coordinator is placed first.</div>
</div></div>
</body>
That is the soup (part of it) that you are getting with:
url = 'https://www.itf.gov.hk/en/project-search/project-profile/index.html?ReferenceNo=TVP/2122/22'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
The content of the page is filled in afterwards by:
<div id="content"><script src="/filemanager/template/common/js/search.js" type="text/javascript"></script> ...
...
</div>
Your code is probably OK, but the page you are scraping is not yet filled with data at the time you make the request.
Maybe you could try to get it using Selenium.
Regards...
I used Selenium and webdriver_manager to handle the JavaScript execution.
To install Selenium, run pip install selenium; to load the drivers automatically, install webdriver_manager with pip install webdriver-manager.
Here is my code (worked for me):
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager  # automatic webdriver for the Chrome browser (can change to your browser)
import pandas as pd

URL = 'https://www.itf.gov.hk/en/project-search/project-profile/index.html?ReferenceNo=TVP/2122/22'
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml; q=0.9,image/webp,image/apng,*/*;q=0.8"
}

# open the page and get elements from the table
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options, executable_path=ChromeDriverManager().install())
driver.get(URL)
time.sleep(4)  # sleep (4 sec) so the table on the site has time to load

# get the name of each row and its content
data = {}
index_elem = 1
while True:
    # step through the rows until we reach a non-existent one
    try:
        columns_name = driver.find_element(
            By.XPATH, f'//*[@id="projectProfile"]/table/tbody/tr[{index_elem}]/th').text
        columns_content = driver.find_element(
            By.XPATH, f'//*[@id="projectProfile"]/table/tbody/tr[{index_elem}]/td').text
        data[columns_name] = [columns_content]
        index_elem += 1
    except NoSuchElementException:
        break

df = pd.DataFrame(data)
print(df)
Output:
Project Reference ... Technological Solution(s)
0 TVP/2122/22 ... Point-of-Sales (POS) System

Scraping a table in Python with the help of BeautifulSoup

I want to scrape 3 tables from this url https://ens.dk/sites/ens.dk/files/OlieGas/mp202112ofu.htm
table 1 : oil production
table 2 : gas production
table 3 : water production
It doesn't need to include the charts, just the 3 tables.
I have written code to scrape the links, but I'm not sure how to scrape the tables from those links.
import io
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs, SoupStrainer
import re

url = "https://ens.dk/en/our-services/oil-and-gas-related-data/monthly-and-yearly-production"
first_page = requests.get(url)
soup = bs(first_page.content)

def parse_page(link):
    print(link)
    df = pd.read_html(link, skiprows=1, header=1)
    return df

def get_gas_links():
    glinks = []
    gas_links = soup.find('table').find_all('a')
    for i in gas_links:
        extracted_link = i['href']
        # you can validate the extracted link however you want
        glinks.append("https://ens.dk/" + extracted_link)
    return glinks

get_gas_links()
The output is a list of links.
Just use pandas to get the table data and then massage it to your liking.
Here's how:
import requests
import pandas as pd
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:96.0) Gecko/20100101 Firefox/96.0",
}
page = requests.get("https://ens.dk/sites/ens.dk/files/OlieGas/mp202112ofu.htm", headers=headers).text
df = pd.concat(pd.read_html(page, flavor="lxml"))
df.to_csv("oil_tables.csv", index=False)
This will get you a .csv file with all the tables from that page concatenated together.
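If you also want the tables from every monthly report collected by get_gas_links() in the question, a minimal sketch along the same lines (assuming each linked page exposes its tables to pandas.read_html in the same way) would be:
import requests
import pandas as pd

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:96.0) Gecko/20100101 Firefox/96.0",
}

all_tables = []
for link in get_gas_links():                      # helper defined in the question
    page = requests.get(link, headers=headers).text
    tables = pd.read_html(page, flavor="lxml")    # one DataFrame per table on the page
    all_tables.append(tables)

# assumption: the oil, gas and water production tables are among the DataFrames
# returned for each report; e.g. inspect the first report:
for df in all_tables[0][:3]:
    print(df.head())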

Trouble with Beautiful Soup Scraping

I am working on scraping multiple pages of search results from this website into a neatly formatted pandas dataframe.
I've outlined the steps for how I plan to finish this task:
1.) Identify the information from each result I want to pull (3 things)
2.) Pull all the information for those 3 things into separate lists
3.) Append the items in the lists to a pandas dataframe through a for loop
Here is what I've tried so far:
import requests
import pandas as pd
#!pip install bs4
from bs4 import BeautifulSoup as bs

url = 'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
result = requests.get(url, headers=headers)
soup = bs(result.text, 'html.parser')

titles = soup.find_all('h5')
authors = soup.find_all('p')
#dates = soup.find_all('')

#append in for loop
data = []
for i in range(2, 22):
    data.append(titles[i].text)
    data.append(authors[i].text)
    #data.append(dates[i].text)

data = pd.DataFrame()
Before I convert data to a pandas dataframe, I can see the results, but the last line essentially erases the results.
Also, I'm not quite sure how to iterate over the multiple search result pages. I found some code that allows you to pick a starting and ending web page to iterate over like this:
URL = ['https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy&page=2',
       'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy&page=4']

for url in range(0, 2):
    req = requests.get(URL[url])
    soup = bs(req.text, 'html.parser')
    titles = soup.find_all('h5')
    print(titles)
The issue I'm having with this approach is that the first page is not formatted the same as all the other pages. Starting on page two, the end of the URL reads "&page=2". I'm not sure how to account for that.
To summarize, the end result I'm looking for is a dataframe that looks something like this:
Title Author Date
Blah1 Agency1 09/23/2020
Blah2 Agency2 08/22/2018
Blah3 Agency3 06/02/2017
....
Can someone please help point me in the right direction? Very lost on this one.
I don't think you need to parse all the pages; just download the CSV.
import pandas as pd
import requests
import io
url = 'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy'
url += '&format=csv' # <- Download as CSV
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
result = requests.get(url, headers=headers)
df = pd.read_csv(io.StringIO(result.text))
Output:
>>> df
title type ... pdf_url publication_date
0 Corporate Average Fuel Economy Standards for M... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/03/2021
1 Public Hearing for Corporate Average Fuel Econ... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/14/2021
2 Investigation of Urea Ammonium Nitrate Solutio... Notice ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/08/2021
3 Anchorage Regulations; Mississippi River, Mile... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-08... 08/30/2021
4 Call for Nominations To Serve on the National ... Notice ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/08/2021
.. ... ... ... ... ...
112 Endangered and Threatened Wildlife and Plants;... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/07/2021
113 Energy Conservation Program: Test Procedures f... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/01/2021
114 Taking of Marine Mammals Incidental to Commerc... Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/17/2021
115 Partial Approval and Partial Disapproval of Ai... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/24/2021
116 Clean Air Plans; California; San Joaquin Valle... Proposed Rule ... https://www.govinfo.gov/content/pkg/FR-2021-09... 09/01/2021
[117 rows x 8 columns]
If I understand your question, then here is a working solution. The starting URL and the URL with page number = 1 are the same thing, and I scrape range(1, 5), meaning 4 pages. You can increase or decrease the range of page numbers at any time. To store the data in CSV format, uncomment the last line.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

data = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'}

for page in range(1, 5):
    url = 'https://www.federalregister.gov/documents/search?conditions%5Bpublication_date%5D%5Bgte%5D=08%2F28%2F2021&conditions%5Bterm%5D=economy%27&page={page}'.format(page=page)
    print(url)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    tags = soup.find_all('div', class_='document-wrapper')
    for pro in tags:
        title = pro.select_one('h5 a').get_text(strip=True)
        author = pro.select_one('p a:nth-child(1)').get_text(strip=True)
        date = pro.select_one('p a:nth-child(2)').get_text(strip=True)
        data.append([title, author, date])

cols = ["Title", "Author", "Date"]
df = pd.DataFrame(data, columns=cols)
print(df)
#df.to_csv("data_info.csv", index = False)

Fetching information from the different links on a web page and writing it to a .xls file using pandas and bs4 in Python

I am a beginner at Python programming. I am practicing web scraping using the bs4 module in Python.
I have extracted some fields from a web page, but it extracts only 13 items even though the web page has more than 13 items. I cannot understand why the rest of the items are not extracted.
Another thing is that I want to extract the contact number and the email address of each item on the web page, but they are only available on each item's individual page. I am a beginner and, frankly speaking, I am stuck on how to access and scrape each item's individual page from the given web page. Kindly tell me where I am going wrong and, if possible, suggest what is to be done.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

res = requests.post('https://www.nelsonalexander.com.au/real-estate-agents/?office=&agent=A')
soup = bs(res.content, 'lxml')
data = soup.find_all("div", {"class": "agent-card large large-3 medium-4 small-12 columns text-center end"})

records = []
for item in data:
    name = item.find('h2').text.strip()
    position = item.find('h3').text.strip()
    records.append({'Names': name, 'Position': position})

df = pd.DataFrame(records, columns=['Names', 'Position'])
df = df.drop_duplicates()
df.to_excel(r'C:\Users\laptop\Desktop\NelsonAlexander.xls', sheet_name='MyData2', index=False, header=True)
I wrote the above code to extract just the name and position of each item, but it only scrapes 13 records even though there are more on the web page. I could not write any code for extracting the contact number and email address of each record because they are present on each item's individual page, which is where I got stuck.
That website loads the list dynamically as you scroll; however, you can trace the AJAX request and parse the data directly:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', 'Referer': 'https://www.nelsonalexander.com.au/real-estate-agents/?office=&agent=A'}

records = []
with requests.Session() as s:
    for i in range(1, 22):
        res = s.get(f'https://www.nelsonalexander.com.au/real-estate-agents/page/{i}/?ajax=1&agent=A', headers=headers)
        soup = bs(res.content, 'lxml')
        data = soup.find_all("div", {"class": "agent-card large large-3 medium-4 small-12 columns text-center end"})
        for item in data:
            name = item.find('h2').text.strip()
            position = item.find('h3').text.strip()
            phone = item.find("div", {"class": "small-6 columns text-left"}).find("a").get('href').replace("tel:", "")
            records.append({'Names': name, 'Position': position, 'Phone': phone})

df = pd.DataFrame(records, columns=['Names', 'Position', 'Phone'])
df = df.drop_duplicates()
df.to_excel(r'C:\Users\laptop\Desktop\NelsonAlexander.xls', sheet_name='MyData2', index=False, header=True)
I am convinced that the emails are nowhere in the DOM. I made some modifications to @drec4s's code to instead keep going until there are no more entries (dynamically).
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import itertools

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', 'Referer': 'https://www.nelsonalexander.com.au/real-estate-agents/?office=&agent=A'}

records = []
with requests.Session() as s:
    for i in itertools.count():
        res = s.get('https://www.nelsonalexander.com.au/real-estate-agents/page/{}/?ajax=1&agent=A'.format(i), headers=headers)
        soup = bs(res.content, 'lxml')
        data = soup.find_all("div", {"class": "agent-card large large-3 medium-4 small-12 columns text-center end"})
        if len(data) > 0:
            for item in data:
                name = item.find('h2').text.strip()
                position = item.find('h3').text.strip()
                phone = item.find("div", {"class": "small-6 columns text-left"}).find("a").get('href').replace("tel:", "")
                records.append({'Names': name, 'Position': position, 'Phone': phone})
                print({'Names': name, 'Position': position, 'Phone': phone})
        else:
            break
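If you also want the Excel file from this version, you can reuse the DataFrame and export lines from the answer above (a minimal sketch, assuming the same output path):
# build the DataFrame from the collected records and write it out, as in the earlier answer
df = pd.DataFrame(records, columns=['Names', 'Position', 'Phone'])
df = df.drop_duplicates()
df.to_excel(r'C:\Users\laptop\Desktop\NelsonAlexander.xls', sheet_name='MyData2', index=False, header=True)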
