Add a column while getting data with ThreadPoolExecutor in Python

I want to read several pages of the API below, each with a different number in the URL, using ThreadPoolExecutor, and save the number that belongs to each response to a dataframe as a new column.
https://booking.snav.it/api/v1/rates/1030/2019-02-25/1042/2019-02-25?lang=1
The number changes as shown in the code below:
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

from pandas import json_normalize
import pandas as pd
import requests

def download_file(url):
    url_info = requests.get(url, stream=True)
    jdata = url_info.json()
    return jdata

nums = [1030, 1031, 1040, 1050, 1020, 1021, 1010, 1023]
urls = [f"https://booking.snav.it/api/v1/rates/{i}/2019-02-25/1042/2019-02-25?lang=1" for i in nums]

processes = []
df = pd.DataFrame()

with ThreadPoolExecutor(max_workers=14) as executor:
    for url in urls:
        sleep(0.1)
        processes.append(executor.submit(download_file, url))

for index, task in enumerate(as_completed(processes)):
    jdata = task.result()
    tmp = json_normalize(jdata)
    tmp["num"] = nums[index]  # index follows completion order, not submission order
    df = df.append(tmp)

print(df.head())
In the code above I try to read the data with multi-threading and add the related number to each JSON response as a new column of the df dataframe. But this code does not work: because of the multi-threading, the order in which the JSON responses complete is not the same as the order of the numbers in nums, so the wrong number gets attached to each response. What should I do?

Try this: executor.map returns the results in the same order as the input urls, so index lines up with nums again.
from concurrent.futures import ThreadPoolExecutor
...
with ThreadPoolExecutor(max_workers=14) as executor:
    rv = executor.map(download_file, urls)

for index, jdata in enumerate(rv):
    tmp = json_normalize(jdata)
    tmp["num"] = nums[index]
    df = df.append(tmp)

print(df.head())
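If you prefer to keep as_completed (for example, to process results as soon as they finish), another option is to remember which number each future belongs to instead of relying on list order. A minimal sketch, assuming download_file, nums and urls are defined as in the question:

import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from pandas import json_normalize

df = pd.DataFrame()
with ThreadPoolExecutor(max_workers=14) as executor:
    # map each future to the number that produced it
    future_to_num = {executor.submit(download_file, url): num
                     for num, url in zip(nums, urls)}
    for task in as_completed(future_to_num):
        tmp = json_normalize(task.result())
        tmp["num"] = future_to_num[task]  # correct number regardless of completion order
        df = pd.concat([df, tmp], ignore_index=True)
print(df.head())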

Related

Appending Dataframe to another dataframe with first row removed

# Import libs
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

# Form Data for passing to the request body
formdata = {'objid': '14'}
# URL
url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"
# Query
for i in range(1, 15):
    formdata["objid"] = str(i)
    response = requests.request("POST", url, data=formdata, timeout=1500)
    out = response.content
    soup = BeautifulSoup(out, "html.parser")
    bat = json.loads(soup.text)
    df = pd.DataFrame(bat["ops1"])
    df.to_csv(str(i) + ".csv")
Right now this loop creates 14 CSV files. What I want is for the loop to drop the repeated column-header rows and append the data to a dataframe created outside the loop, so that I end up with a single CSV file.
I am using BeautifulSoup and pandas.
This is one way of achieving your goal:
# Import libs
import pandas as pd
import requests
from tqdm import tqdm  # if using jupyter: from tqdm.notebook import tqdm

final_df = pd.DataFrame()
# URL
url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"
# Query
for i in tqdm(range(1, 15)):
    formdata = {'objid': i}
    r = requests.post(url, data=formdata)
    df = pd.json_normalize(r.json()["ops1"])
    final_df = pd.concat([final_df, df], axis=0, ignore_index=True)
final_df.to_csv('some_data_saved.csv')
print(final_df)
Data will be saved to a csv file, and also printed in terminal:
100% 14/14 [00:14<00:00, 1.05s/it]
value text
0 8o7LEdvX2e G14001-Kumbadaje
1 jw2XOQyZ4K G14002-Bellur
2 0lMB1O4LbV G14003-Karadka
3 zodLro2Z39 G14004-Muliyar
4 dWxLYn8ZME G14005-Delampady
... ... ...
1029 Qy6Z09bBKE G01073-Ottoor
1030 ywoXG8wLxV M01001-Neyyattinkara
1031 Kk8Xvz7XO9 M01002-Nedumangad
1032 r7eXQYgX8m M01003-Attingal
1033 b3KXlO2B8g M01004-Varkala
1034 rows × 2 columns
Requests can parse JSON responses directly (r.json()), so you don't need to import bs4 & json.
For TQDM, please see https://pypi.org/project/tqdm/
For pandas documentation, visit https://pandas.pydata.org/docs/
Also for Requests: https://requests.readthedocs.io/en/latest/
I would use a function to get the data and return a DataFrame, then use it within concat:
def get_data(i):
    formdata["objid"] = str(i)
    response = requests.request("POST", url, data=formdata, timeout=1500)
    out = response.content
    soup = BeautifulSoup(out, "html.parser")
    bat = json.loads(soup.text)
    return pd.DataFrame(bat["ops1"])

df = pd.concat([get_data(i) for i in range(1, 15)])
df.to_csv('all_data.csv')
NB. if this gives you unsatisfactory results, please provide a short extract of 2/3 dataframes and the expected merged output.
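Following the note above that the endpoint returns JSON, a slightly leaner sketch of the same helper could drop BeautifulSoup and json entirely (the timeout value here is an arbitrary choice):

import pandas as pd
import requests

url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"

def get_data(i):
    # POST the form data and parse the JSON body directly
    response = requests.post(url, data={'objid': str(i)}, timeout=30)
    return pd.DataFrame(response.json()["ops1"])

df = pd.concat([get_data(i) for i in range(1, 15)], ignore_index=True)
df.to_csv('all_data.csv', index=False)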

Create empty dataframes in Google Sheets

I am creating a web scraping program using python, BeautifulSoup, pandas and Google Sheets.
Up until now I have managed to scrape data tables from URLs which I'm getting from a list in Google Sheets, and I have created a dataframe for each dataset. Some of the cells in the URL column are empty, which gives me the following error when I try to import the dataframes into another sheet:
MissingSchema: Invalid URL '': No schema supplied. Perhaps you meant http://?
What I'd like to achieve is that for every empty cell in the URL sheet, an empty dataframe is created, just like the ones with data in them. Is that possible?
My code so far looks like this:
import gspread
from df2gspread import df2gspread as d2g
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
from bs4 import BeautifulSoup
import pandas as pd
import requests

credentials = service_account.Credentials.from_service_account_file(
    'credentials.json')
scoped_credentials = credentials.with_scopes(
    ['https://spreadsheets.google.com/feeds',
     'https://www.googleapis.com/auth/drive']
)
gc = gspread.Client(auth=scoped_credentials)
gc.session = AuthorizedSession(scoped_credentials)
spreadsheet_key = gc.open_by_key('api_key')

# Data import
data_worksheet = spreadsheet_key.worksheet("Data")
# Url's
url_worksheet = spreadsheet_key.worksheet("Urls")
link_list = url_worksheet.col_values(2)

def get_info(linkIndex):
    page = requests.get(link_list[linkIndex])
    soup = BeautifulSoup(page.content, 'html.parser')
    try:
        tbl = soup.find('table')
        labels = []
        results = []
        for tr in tbl.findAll('tr'):
            headers = [th.text.strip() for th in tr.findAll('th')]
            data = [td.text.strip() for td in tr.findAll('td')]
            labels.append(headers)
            results.append(data)
        final_results = []
        for final_labels, final_data in zip(labels, results):
            final_results.append({'Labels': final_labels, 'Data': final_data})
        df = pd.DataFrame(final_results)
        df['Labels'] = df['Labels'].str[0]
        df['Data'] = df['Data'].str[0]
        indexNames = df[df['Labels'] == 'Links'].index
        df.drop(indexNames, inplace=True)
        set_with_dataframe(data_worksheet, df, col=(linkIndex*6)+1, row=2,
                           include_column_header=False)
    except Exception as e:
        print(e)

for linkInd in range(len(link_list))[1:]:
    get_info(linkInd)
It depends on what you mean by an empty dataframe. If it's a dataframe containing no data at all, it can be created with pd.DataFrame(). If it's a dataframe containing np.NaN / None values in the same columns as the other dataframes, it can be created from a dict:
import numpy as np
import pandas as pd

# x is the number of rows in the dataframe
x = 10
d = {
    'column1': [np.NaN] * x,
    'column2': [np.NaN] * x,
    'column3': [np.NaN] * x
}
df = pd.DataFrame(d)
At the beginning of the get_info() function a check should be added:
if link_list[linkIndex] is not None:  # or: if link_list[linkIndex] != '' (depending on the format of an empty cell)
The already existing logic goes in the if branch, and the empty dataframe is created in the else branch. set_with_dataframe() should be called after the if / else statement, because it is executed in both cases.
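A minimal sketch of that structure; scrape_table here is a hypothetical name standing in for the scraping code currently inside the try block, and the column names are taken from the question:

def get_info(linkIndex):
    link = link_list[linkIndex]
    if link:  # or: link != '' / link is not None, depending on how empty cells come back
        df = scrape_table(link)  # hypothetical helper wrapping the existing scraping logic
    else:
        # empty cell: placeholder dataframe with the same columns but no rows
        df = pd.DataFrame(columns=['Labels', 'Data'])
    # runs in both cases, after the if / else
    set_with_dataframe(data_worksheet, df, col=(linkIndex * 6) + 1, row=2,
                       include_column_header=False)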

How to categorize these websites as non-working?

I have been developing a program which checks whether a website is working or not. I fetch URLs from an Excel sheet and then write the results back as True (working) or False (non-working) to the same sheet, but for some URLs such as http://www.andrewelliotgroup.com/ and https://www.sovranollc.com/ the check reports the website as working even though there is no data on the site. I want to classify this type of website, with no data, as non-working. Apart from the status code, what other KPI can I use to detect this? Here is the code:
import http.client as httpc
from urllib.parse import urlparse
import pandas as pd
import xlwings as xw
import smtplib
from xlsxwriter import Workbook
from xlutils import copy
import socket
import requests
from bs4 import BeautifulSoup as BS

socket.getaddrinfo('172.23.24.143', 8080)

x = []
df = pd.read_excel(r'xyz.xlsx')
df1 = pd.DataFrame(df, columns=['URL'])
print(df1)
url_list = df["URL"].tolist()
print(url_list)

def checkUrl(i):
    if 'http' not in i:
        i = 'https://' + i
    p = urlparse(i)
    if p.scheme == 'http':
        conn = httpc.HTTPConnection(p.netloc, timeout=15)
    else:
        conn = httpc.HTTPSConnection(p.netloc, timeout=15)
    try:
        conn.request('HEAD', p.path)
        resp = conn.getresponse()
        return resp.status < 400
    except OSError:
        return False

for i in url_list:
    result = checkUrl(i)
    print(result)
    x.append(result)

workbook = Workbook('xyz.xlsx')
Report_Sheet = workbook.add_worksheet()
Report_Sheet.write(0, 1, 'Value')
Report_Sheet.write_column(1, 1, x)
Report_Sheet.write(0, 0, 'URL')
Report_Sheet.write_column(1, 0, url_list)
workbook.close()
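One extra signal worth considering, beyond the status code (this is a suggestion, not part of the original post), is how much visible text the page actually contains; a rough sketch using the requests and BeautifulSoup imports already present in the code above, with an arbitrary 200-character threshold:

import requests
from bs4 import BeautifulSoup as BS

def has_content(url, min_chars=200):
    # Treat a page with almost no visible text as "no data".
    try:
        r = requests.get(url, timeout=15)
        if r.status_code >= 400:
            return False
        soup = BS(r.text, 'html.parser')
        # drop script/style tags so only human-visible text is counted
        for tag in soup(['script', 'style']):
            tag.decompose()
        visible_text = soup.get_text(separator=' ', strip=True)
        return len(visible_text) >= min_chars
    except requests.RequestException:
        return False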

Unable to store pandas data frame as a csv

I am following this tutorial to retrieve data from news sites.
The main function is getDailyNews. It loops over each news source, requests the API, extracts the data, dumps it into a pandas DataFrame and then exports the result to a CSV file.
But when I run the code, I get an error.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from tqdm import tqdm, tqdm_notebook
from functools import reduce

def getSources():
    source_url = 'https://newsapi.org/v1/sources?language=en'
    response = requests.get(source_url).json()
    sources = []
    for source in response['sources']:
        sources.append(source['id'])
    return sources

def mapping():
    d = {}
    response = requests.get('https://newsapi.org/v1/sources?language=en')
    response = response.json()
    for s in response['sources']:
        d[s['id']] = s['category']
    return d

def category(source, m):
    try:
        return m[source]
    except:
        return 'NC'

def getDailyNews():
    sources = getSources()
    key = '96f279e1b7f845669089abc016e915cc'
    url = 'https://newsapi.org/v1/articles?source={0}&sortBy={1}&apiKey={2}'
    responses = []
    for i, source in tqdm_notebook(enumerate(sources), total=len(sources)):
        try:
            u = url.format(source, 'top', key)
        except:
            u = url.format(source, 'latest', key)
        response = requests.get(u)
        r = response.json()
        try:
            for article in r['articles']:
                article['source'] = source
            responses.append(r)
        except:
            print('Rate limit exceeded ... please wait and retry in 6 hours')
            return None
    articles = list(map(lambda r: r['articles'], responses))
    articles = list(reduce(lambda x, y: x + y, articles))
    news = pd.DataFrame(articles)
    news = news.dropna()
    news = news.drop_duplicates()
    news.reset_index(inplace=True, drop=True)
    d = mapping()
    news['category'] = news['source'].map(lambda s: category(s, d))
    news['scraping_date'] = datetime.now()
    try:
        aux = pd.read_csv('./data/news.csv')
        aux = aux.append(news)
        aux = aux.drop_duplicates('url')
        aux.reset_index(inplace=True, drop=True)
        aux.to_csv('./data/news.csv', encoding='utf-8', index=False)
    except:
        news.to_csv('./data/news.csv', index=False, encoding='utf-8')
    print('Done')

if __name__ == '__main__':
    getDailyNews()
Error:
FileNotFoundError: [Errno 2] No such file or directory: './data/news.csv'
I know that I have to give the path name in pd.read_csv but I don't know which path I have to give here.
This error would make sense if there wasn't already a data folder in the directory you are executing this program from. There is a similar problem in the post here.
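A quick way to rule that out is to make sure the folder exists before the script writes to it; a minimal sketch, assuming the script is run from the project root:

import os

# create ./data if it is missing; do nothing if it already exists
os.makedirs('./data', exist_ok=True)

With the folder in place, news.to_csv('./data/news.csv', ...) can create the file on the first run, and pd.read_csv will find it on later runs.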

How to add all the pages of an api into a pandas dataframe

I know that pandas has the read_json function to get data from an API into a dataframe. But is there any way to read through all the pages of the API and load them into the same dataframe?
import requests
import pandas as pd
import config

api_key = config.api_key
url = "http://api.themoviedb.org/3/discover/movie?release_date.gte=2017-12-01&release_date.lte=2017-12-31&api_key=" + api_key
payload = "{}"
response = requests.request("GET", url, data=payload)
print(response.text.encode("utf-8"))
I tried the requests approach above, but it only gives me the first page of the API. I wanted to see if there is any way I can do it with pd.read_json as below, but I cannot work out how to loop over all the pages and load everything into one dataframe for further analysis.
df = pd.read_json('http://api.themoviedb.org/3/discover/movie?release_date.gte=2017-12-01&release_date.lte=2017-12-31&api_key=''&page=%s' % page)
You can read each page into a dataframe and concatenate them:
page = 1  # TMDb pages are 1-indexed
df = []
while True:
    try:
        next_page = pd.read_json('http://api.themoviedb.org/3/discover/movie?release_date.gte=2017-12-01&release_date.lte=2017-12-31&api_key=''&page=%s' % page)
        # didn't get any content, stop
        if len(next_page) == 0:
            break
        else:
            # move on to the next page
            df.append(next_page)
            page += 1
    except:
        # if we get an error from the API call, maybe the URL for that page doesn't exist,
        # so stop
        break
df = pd.concat(df, axis=0)
See the pandas documentation for pd.concat. Hope it helps :)
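If you have the key available, an alternative sketch is to ask the API how many pages exist and loop over exactly that range with requests; this relies on the total_pages field that the TMDb discover endpoint returns, and the api_key value below is a placeholder:

import requests
import pandas as pd

api_key = "YOUR_API_KEY"  # placeholder: use your own TMDb key
base = "https://api.themoviedb.org/3/discover/movie"
params = {
    "release_date.gte": "2017-12-01",
    "release_date.lte": "2017-12-31",
    "api_key": api_key,
    "page": 1,
}

# first request tells us how many pages there are
first = requests.get(base, params=params).json()
frames = [pd.json_normalize(first["results"])]

for page in range(2, first.get("total_pages", 1) + 1):
    params["page"] = page
    data = requests.get(base, params=params).json()
    frames.append(pd.json_normalize(data["results"]))

df = pd.concat(frames, ignore_index=True)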
