# Import libs
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
# Form Data for passing to the request body
formdata = {'objid': '14'}
# URL
url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"
# Query
for i in range(1, 15):
    formdata["objid"] = str(i)
    response = requests.request("POST", url, data=formdata, timeout=1500)
    out = response.content
    soup = BeautifulSoup(out, "html.parser")
    bat = json.loads(soup.text)
    df = pd.DataFrame(bat["ops1"])
    df.to_csv(str(i) + ".csv")
Right now this query creates 14 CSV files. What I want is for the loop to drop the column headers and append each page's data to a dataframe created outside the loop, so that I end up with a single CSV file.
I am using BeautifulSoup and pandas.
This is one way of achieving your goal:
# Import libs
import pandas as pd
import requests
from tqdm import tqdm ## if using jupyter: from tqdm.notebook import tqdm
final_df = pd.DataFrame()
# URL
url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"
# Query
for i in tqdm(range(1, 15)):
    formdata = {'objid': i}
    r = requests.post(url, data=formdata)
    df = pd.json_normalize(r.json()["ops1"])
    final_df = pd.concat([final_df, df], axis=0, ignore_index=True)
final_df.to_csv('some_data_saved.csv')
print(final_df)
Data will be saved to a csv file, and also printed in terminal:
100%
14/14 [00:14<00:00, 1.05s/it]
value text
0 8o7LEdvX2e G14001-Kumbadaje
1 jw2XOQyZ4K G14002-Bellur
2 0lMB1O4LbV G14003-Karadka
3 zodLro2Z39 G14004-Muliyar
4 dWxLYn8ZME G14005-Delampady
... ... ...
1029 Qy6Z09bBKE G01073-Ottoor
1030 ywoXG8wLxV M01001-Neyyattinkara
1031 Kk8Xvz7XO9 M01002-Nedumangad
1032 r7eXQYgX8m M01003-Attingal
1033 b3KXlO2B8g M01004-Varkala
1035 rows × 2 columns
Requests can return responses in JSON format, so you don't need to import bs4 & json.
For TQDM, please see https://pypi.org/project/tqdm/
For pandas documentation, visit https://pandas.pydata.org/docs/
Also for Requests: https://requests.readthedocs.io/en/latest/
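For reference, a minimal sketch (same endpoint as above) showing how the original BeautifulSoup + json.loads step collapses into the response's built-in JSON decoder:

import requests

url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"
response = requests.post(url, data={"objid": "1"}, timeout=30)
# response.json() parses the JSON body directly, replacing
# BeautifulSoup(response.content, "html.parser") + json.loads(soup.text)
bat = response.json()
print(bat["ops1"][:2])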
I would use a function to get the data and return a DataFrame, then use it within concat:
def get_data(i):
    formdata["objid"] = str(i)
    response = requests.request("POST", url, data=formdata, timeout=1500)
    out = response.content
    soup = BeautifulSoup(out, "html.parser")
    bat = json.loads(soup.text)
    return pd.DataFrame(bat["ops1"])

df = pd.concat([get_data(i) for i in range(1, 15)])
df.to_csv('all_data.csv')
NB. if this gives you unsatisfactory results, please provide a short extract of 2/3 dataframes and the expected merged output.
Related
I want to crawl all the advertisements, but my output is "unmatched '}'". Is there an easy way to do this? I tried BeautifulSoup before, but I think either it's not the right tool or I'm using it the wrong way.
How can I scrape all '199 yeni tikili binalar' from the website?
from ast import literal_eval
from bs4 import BeautifulSoup as bs
import requests
import re
import json
import pandas as pd
url = "https://korter.az/yasayis-kompleksleri-baku"
html_doc = requests.get(url).text
data = re.search(r'2804\.jpg"\}\}\}\],(".*")', html_doc).group(1)
data = json.loads(literal_eval(data))
df = pd.DataFrame(data)
df.to_excel('korter.xlsx', index=False)
The site has an API which can be accessed with requests.
The URL of the API is: "https://korter.az/api/building/listing?mainGeoObjectId=1&page=1&lang=az-AZ&locale=az-AZ"
Full Code
import requests
import math
import pandas as pd
def roundup(x):
    return int(math.ceil(x / 20.0)) * 20

# Getting the number of results
url1 = "https://korter.az/api/building/listing?mainGeoObjectId=1&page=1&lang=az-AZ&locale=az-AZ"
r = requests.get(url1)
no_of_outcomes = r.json()["totalBuildingsCount"]
# The API returns 20 results per page, so round the count (199) up to a
# multiple of 20 before dividing by 20 to get the number of pages.
no_of_outcomes = roundup(no_of_outcomes)
# Getting the sub-URL from each page by looping.
result_url = []
previous_subdata = []
for k in range(1, int(no_of_outcomes/20)+1):
    url = f"https://korter.az/api/building/listing?mainGeoObjectId=1&page={k}&lang=az-AZ&locale=az-AZ"
    r = requests.get(url)
    subdata = r.json()["buildings"]
    for i in subdata:
        suburl = "https://korter.az" + i["url"]
        result_url.append(suburl)
print(len(result_url))
df = pd.DataFrame(result_url)
print(df)
Output
199
0
0 https://korter.az/toca-residence-baki
1 https://korter.az/malibu-residence-baki
2 https://korter.az/zirve-park-baki
3 https://korter.az/melissa-park-baki
4 https://korter.az/white-hotel-baki
.. ...
194 https://korter.az/yasham-boulevard-baki
195 https://korter.az/koroglu-baki
196 https://korter.az/luxor-palace-baki
197 https://korter.az/shirvanshahlar-residence-baki
198 https://korter.az/baki-baglari-baki
[199 rows x 1 columns]
Hope this helps. Happy Coding :)
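As a side note, the page count could also be computed directly with math.ceil instead of rounding the total up to a multiple of 20 first; a small, equivalent sketch:

import math

total = 199                    # value of r.json()["totalBuildingsCount"]
pages = math.ceil(total / 20)  # 20 results per page -> 10 pages
print(pages)                   # same page count as int(roundup(total) / 20)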
I want to read different pages from the link below with different numbers using ThreadPoolExecutor and save the related numbers to a dataframe as a new column.
https://booking.snav.it/api/v1/rates/1030/2019-02-25/1042/2019-02-25?lang=1
The numbers change as below:
from concurrent.futures import ThreadPoolExecutor, as_completed
from pandas import json_normalize
from time import sleep
import pandas as pd
import requests

def download_file(url):
    url_info = requests.get(url, stream=True)
    jdata = url_info.json()
    return jdata

nums = [1030, 1031, 1040, 1050, 1020, 1021, 1010, 1023]
urls = [f"https://booking.snav.it/api/v1/rates/{i}/2019-02-25/1042/2019-02-25?lang=1" for i in nums]

df = pd.DataFrame()
processes = []
with ThreadPoolExecutor(max_workers=14) as executor:
    for url in urls:
        sleep(0.1)
        processes.append(executor.submit(download_file, url))

for index, task in enumerate(as_completed(processes)):
    jdata = task.result()
    tmp = json_normalize(jdata)
    tmp["num"] = nums[index]
    df = df.append(tmp)
print(df.head())
In the code above I have tried to read the data using multi-threading and to add the related number to each JSON response as a new column of the df dataframe. But this code does not work: because of the multi-threading, the order of nums is not the same as the order of the scraped JSON responses. What should I do?
Try this:
from concurrent.futures import ThreadPoolExecutor
...
with ThreadPoolExecutor(max_workers=14) as executor:
    rv = executor.map(download_file, urls)
    for index, jdata in enumerate(rv):
        tmp = json_normalize(jdata)
        tmp["num"] = nums[index]
        df = df.append(tmp)
print(df.head())
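If you would rather not rely on result order at all, another option (a sketch reusing the URL pattern and nums from the question) is to pass the number to the worker and return it alongside the JSON, so the pairing is explicit:

from concurrent.futures import ThreadPoolExecutor
from pandas import json_normalize
import pandas as pd
import requests

nums = [1030, 1031, 1040, 1050, 1020, 1021, 1010, 1023]

def download_file(num):
    # build the URL from the number and return both together,
    # so the pairing does not depend on completion order
    url = f"https://booking.snav.it/api/v1/rates/{num}/2019-02-25/1042/2019-02-25?lang=1"
    return num, requests.get(url).json()

frames = []
with ThreadPoolExecutor(max_workers=14) as executor:
    for num, jdata in executor.map(download_file, nums):
        tmp = json_normalize(jdata)
        tmp["num"] = num
        frames.append(tmp)

df = pd.concat(frames, ignore_index=True)
print(df.head())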
I am trying to convert multiple HTML tables to a pandas dataframe.
For this task I've defined a function that should return all these HTML tables as a single pandas dataframe.
However, the function returns an empty list [] instead of a dataframe.
Here's what I've tried so far:
Getting all the needed links as a list
import requests
from bs4 import BeautifulSoup
import lxml
import html5lib
import pandas as pd
import string
### defining a list for all the needed links ###
first_url='https://www.salario.com.br/tabela-salarial/?cargos='
second_url='#listaSalarial'
allTheLetters = string.ascii_uppercase
links = []
for letter in allTheLetters:
    links.append(first_url+letter+second_url)
defining a function
### defining function to parse html objects ###
def getUrlTables(links):
    for link in links:
        # requesting link, parsing and finding tag:table #
        page = requests.get(link)
        soup = BeautifulSoup(page.content, 'html.parser')
        tab_div = soup.find_all('table', {'class':'listas'})
        # writing html files into directory #
        with open('listas_salariales.html', "w") as file:
            file.write(str(tab_div))
        # reading html file as a pandas dataframe #
        tables = pd.read_html('listas_salariales.html')
    return tables
Testing output
getUrlTables(links)
[]
Am I missing something in getUrlTables()?
Is there an easier way to accomplish this task?
The following code will fetch the HTML from all the links, parse it to extract the table data, and construct one large combined dataframe (I have not stored the intermediate dataframes to disk, which might be needed if the tables become too large):
import requests
from bs4 import BeautifulSoup
import lxml
import html5lib
import pandas as pd
import string
### defining a list for all the needed links ###
first_url='https://www.salario.com.br/tabela-salarial/?cargos='
second_url='#listaSalarial'
allTheLetters = string.ascii_uppercase
links = []
for letter in allTheLetters:
    links.append(first_url+letter+second_url)
### defining function to parse html objects ###
def getUrlTables(links, master_df):
    for link in links:
        page = requests.get(link)
        soup = BeautifulSoup(page.content, 'lxml')  # using the lxml parser
        try:
            table = soup.find('table', attrs={'class':'listas'})
            # finding table headers
            heads = table.find('thead').find('tr').find_all('th')
            colnames = [hdr.text for hdr in heads]
            #print(colnames)
            # Now extracting the values
            data = {k: [] for k in colnames}
            rows = table.find('tbody').find_all('tr')
            for rw in rows:
                for col in colnames:
                    cell = rw.find('td', attrs={'data-label': '{}'.format(col)})
                    data[col].append(cell.text)
            # Constructing a pandas dataframe using the data just parsed
            df = pd.DataFrame.from_dict(data)
            master_df = pd.concat([master_df, df], ignore_index=True)
        except AttributeError as e:
            print('No data from the link: {}'.format(link))
    return master_df

master_df = pd.DataFrame()
master_df = getUrlTables(links, master_df)
print(master_df)
The output from the above code is as follows:
CBO Cargo ... Teto Salarial Salário Hora
0 612510 Abacaxicultor ... 2.116,16 6,86
1 263105 Abade ... 5.031,47 17,25
2 263105 Abadessa ... 5.031,47 17,25
3 622020 Abanador na Agricultura ... 2.075,81 6,27
4 862120 Abastecedor de Caldeira ... 3.793,98 11,65
... ... ... ... ... ...
9345 263110 Zenji (missionário) ... 3.888,52 12,65
9346 723235 Zincador ... 2.583,20 7,78
9347 203010 Zoologista ... 4.615,45 14,21
9348 203010 Zoólogo ... 4.615,45 14,21
9349 223310 Zootecnista ... 5.369,59 16,50
[9350 rows x 8 columns]
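As an alternative worth trying, pd.read_html can often parse a plain <table class="listas"> directly from the response text, without the manual thead/tbody walking; a sketch (whether the headers and cell values come through as cleanly as in the output above is an assumption to verify):

import requests
import pandas as pd

frames = []
for link in links:  # the same links list built from the letters above
    page = requests.get(link)
    try:
        # read_html returns a list of dataframes, one per matching <table>
        tables = pd.read_html(page.text, attrs={'class': 'listas'})
        frames.extend(tables)
    except ValueError:
        # raised when no matching table is found on the page
        print('No data from the link: {}'.format(link))

master_df = pd.concat(frames, ignore_index=True)
print(master_df)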
I am creating a web scraping program using python, BeautifulSoup, pandas and Google Sheets.
Up until now I have managed to scrape data tables from URLs which I'm getting from a list in Google Sheets, and I have created dataframes for each dataset. In my list of URLs, some of the cells in the column are empty, which gives me the following error when I try to import the dataframes into another sheet:
MissingSchema: Invalid URL '': No schema supplied. Perhaps you meant http://?
What I'd like to achieve is that for every empty cell in the URL sheet, an empty dataframe is created, just like the ones with data in them. Is that possible?
My code so far looks like this:
import gspread
from df2gspread import df2gspread as d2g
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
from bs4 import BeautifulSoup
import pandas as pd
import requests
credentials = service_account.Credentials.from_service_account_file(
'credentials.json')
scoped_credentials = credentials.with_scopes(
['https://spreadsheets.google.com/feeds',
'https://www.googleapis.com/auth/drive']
)
gc = gspread.Client(auth=scoped_credentials)
gc.session = AuthorizedSession(scoped_credentials)
spreadsheet_key = gc.open_by_key('api_key')
# Data import
data_worksheet = spreadsheet_key.worksheet("Data")
# Url's
url_worksheet = spreadsheet_key.worksheet("Urls")
link_list = url_worksheet.col_values(2)
def get_info(linkIndex):
    page = requests.get(link_list[linkIndex])
    soup = BeautifulSoup(page.content, 'html.parser')
    try:
        tbl = soup.find('table')
        labels = []
        results = []
        for tr in tbl.findAll('tr'):
            headers = [th.text.strip() for th in tr.findAll('th')]
            data = [td.text.strip() for td in tr.findAll('td')]
            labels.append(headers)
            results.append(data)
        final_results = []
        for final_labels, final_data in zip(labels, results):
            final_results.append({'Labels': final_labels, 'Data': final_data})
        df = pd.DataFrame(final_results)
        df['Labels'] = df['Labels'].str[0]
        df['Data'] = df['Data'].str[0]
        indexNames = df[df['Labels'] == 'Links'].index
        df.drop(indexNames, inplace=True)
        set_with_dataframe(data_worksheet, df, col=(linkIndex*6)+1, row=2,
                           include_column_header=False)[1:]
    except Exception as e:
        print(e)

for linkInd in range(len(link_list))[1:]:
    get_info(linkInd)
It depends on what you mean by an empty dataframe. If it's a dataframe containing no data, it can be created with pd.DataFrame(). If it's a dataframe containing np.NaN / None values in the same columns as the other dataframes, it can be created from a dict:
import numpy as np
import pandas as pd

# x is the number of rows in the dataframe
d = {
    'column1': [np.NaN] * x,
    'column2': [np.NaN] * x,
    'column3': [np.NaN] * x
}
df = pd.DataFrame(d)
At the beginning of the get_info() function a check should be added:
if link_list[linkIndex] is not None:  # or: if link_list[linkIndex] != '' (depending on the format of an empty cell)
The already existing logic goes in the if branch; in the else branch, an empty dataframe should be created. The set_with_dataframe() call should come after the if / else statement, because it is executed in both cases.
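Putting that together, a rough sketch of how the branch could sit inside get_info() (this reuses link_list, data_worksheet, requests, BeautifulSoup and set_with_dataframe from the question; the empty-string check and the single-row padding are assumptions):

import numpy as np
import pandas as pd

def get_info(linkIndex):
    if link_list[linkIndex]:  # non-empty cell: scrape as before
        page = requests.get(link_list[linkIndex])
        soup = BeautifulSoup(page.content, 'html.parser')
        # ... the existing parsing logic from the question builds df here ...
        df = pd.DataFrame()   # stand-in for the parsed dataframe
    else:                     # empty cell: placeholder dataframe of NaNs
        x = 1                 # number of rows to pad with (assumed)
        df = pd.DataFrame({'Labels': [np.NaN] * x, 'Data': [np.NaN] * x})
    # called in both cases, after the if / else
    set_with_dataframe(data_worksheet, df, col=(linkIndex*6)+1, row=2,
                       include_column_header=False)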
I know that pandas has the read_json function to effectively get data from an API into a dataframe. But is there any way to actually read through all the pages of the API and load them into the same dataframe?
import requests
import pandas as pd
import config
api_key = config.api_key
url = " http://api.themoviedb.org/3/discover/movie?release_date.gte=2017-12-
01&release_date.lte=2017-12-31&api_key=" + api_key
payload = "{}"
response = requests.request("GET", url, data=payload)
print(response.text.encode("utf-8"))
I tried the requests method, but this only gives me the first page of the API. I wanted to see if there is any way I can do it with the dataframe method as below, but I can't work out how to write a loop that goes over all the pages and puts everything into one dataframe for further analysis.
df = pd.read_json('http://api.themoviedb.org/3/discover/movie?release_date.gte=2017-12-01&release_date.lte=2017-12-31&api_key=''&page=%s' % page)
You can read each page into a dataframe and concatenate them:
page = 1
df = []
while True:
    try:
        next_page = pd.read_json('http://api.themoviedb.org/3/discover/movie?release_date.gte=2017-12-01&release_date.lte=2017-12-31&api_key=''&page=%s' % page)
        # didn't get any content, stop
        if len(next_page) == 0:
            break
        else:
            # move on to the next page
            df.append(next_page)
            page += 1
    except:
        # if we got an error from the API call, maybe the URL for that page
        # doesn't exist, then stop
        break
df = pd.concat(df, axis=0)
See the pandas documentation for pd.concat: https://pandas.pydata.org/docs/reference/api/pandas.concat.html. Hope it helps :)
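If the response's page count is available, the loop can also be bounded up front; TMDB's discover responses carry their rows under a results key along with a total_pages field, so a sketch along those lines (reusing api_key from the question) would be:

import requests
import pandas as pd

base = ("http://api.themoviedb.org/3/discover/movie"
        "?release_date.gte=2017-12-01&release_date.lte=2017-12-31"
        "&api_key=" + api_key)

first = requests.get(base + "&page=1").json()
total_pages = first.get("total_pages", 1)

frames = [pd.json_normalize(first["results"])]
for page in range(2, total_pages + 1):
    data = requests.get(base + "&page=" + str(page)).json()
    frames.append(pd.json_normalize(data["results"]))

df = pd.concat(frames, ignore_index=True)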