I would like to pass n cities to travel to, and the corresponding number of days in each city, to a function that returns a df with all possible permutations of the journey. The kayak_search_url column in the df should contain this string in the first row:
https://www.kayak.com/flights/AMS-WAW,nearby/2023-02-14/WAW-BOG,nearby/2023-02-17/BOG-MIL,nearby/2023-02-20/MIL-SDQ,nearby/2023-02-23/SDQ-AMS,nearby/2023-02-25/?sort=bestflight_a
...but instead contains this string:
https://www.kayak.com/flights/AMS-WAW,nearby/2023-02-14/AMS-BOG,nearby/2023-02-17/AMS-MIL,nearby/2023-02-20/AMS-SDQ,nearby/2023-02-23/AMS,nearby/2023-02-25/?sort=bestflight_a
I can't figure out why the origin code 'AMS' shows up instead of the chain of cities. Here's the code:
# List the cities you want to travel to and from, how long you'd like to stay in each, and the appropriate start/end dates
start_city = 'Amsterdam'
end_city = 'Amsterdam'
start_date = '2023-02-14'
cities = ['Warsaw', 'Bogota', 'Milan', 'Santo Domingo']
days = [3,3,3,2]
def generate_permutations(cities, days, start_city, end_city, start_date):
    city_to_days = dict(zip(cities, days))
    permutations = list(itertools.permutations(cities))
    df = pd.DataFrame(permutations, columns=['city' + str(i) for i in range(1, len(cities) + 1)])
    df['origin'] = start_city
    df['end'] = end_city
    first_column = df.pop('origin')
    df.insert(0, 'origin', first_column)
    st_dt = pd.to_datetime(start_date)
    df = df.assign(flight_dt_1=st_dt)
    for i in range(len(cities)):
        df['flight_dt_' + str(i + 2)] = df['flight_dt_' + str(i + 1)] + df['city' + str(i + 1)].map(city_to_days).map(lambda x: pd.Timedelta(days=x))
    # IATA city code dictionary from iata_code.csv file in repo and create Kayak 'url' column for each permutation
    iata = {'Amsterdam': 'AMS',
            'Warsaw': 'WAW',
            'Bogota': 'BOG',
            'Milan': 'MIL',
            'Santo Domingo': 'SDQ'}
    url = 'https://www.kayak.com/flights/'
    df['kayak_search_url'] = df.apply(lambda x: url + ''.join([iata[x['origin']] + '-' + iata[x['city' + str(i+1)]] +
                                                               ',nearby/' + str(x['flight_dt_' + str(i+1)].strftime("%Y-%m-%d")) + '/'
                                                               for i in range(len(cities))]) + iata[x['end']] + ',nearby/' + str(x['flight_dt_' + str(len(cities) + 1)].strftime("%Y-%m-%d")) +
                                                '/?sort=bestflight_a', axis=1)
    return df
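For reference, the repeated 'AMS' comes from the apply lambda above: every leg's departure airport is taken from iata[x['origin']], and the final piece contains only the end city's code rather than an origin-destination pair. A minimal sketch of a corrected URL builder that chains consecutive stops instead; the build_url helper below is illustrative, not part of the original code, and assumes it is placed inside generate_permutations after iata and url are defined:

# Illustrative fix (assumption): chain origin -> city1 -> ... -> cityN -> end
# and pair consecutive stops, instead of pairing the origin with every city.
def build_url(row, n=len(cities)):
    stops = [row['origin']] + [row['city' + str(i + 1)] for i in range(n)] + [row['end']]
    legs = [
        iata[a] + '-' + iata[b] + ',nearby/' + row['flight_dt_' + str(i + 1)].strftime('%Y-%m-%d')
        for i, (a, b) in enumerate(zip(stops[:-1], stops[1:]))
    ]
    return url + '/'.join(legs) + '/?sort=bestflight_a'

df['kayak_search_url'] = df.apply(build_url, axis=1)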
Let's break down the desired URL to highlight its structure:
https://www.kayak.com/flights
/AMS-WAW,nearby/2023-02-14
/WAW-BOG,nearby/2023-02-17
/BOG-MIL,nearby/2023-02-20
/MIL-SDQ,nearby/2023-02-23
/SDQ-AMS,nearby/2023-02-25
/?sort=bestflight_a
Only the middle section needs to be generated, as the other parts are static. We can also generate that middle section before constructing the dataframe:
import itertools

import numpy as np
import pandas as pd


def generate_permutations(cities, days, start_city, end_city, start_date):
    iata = {
        "Amsterdam": "AMS",
        "Warsaw": "WAW",
        "Bogota": "BOG",
        "Milan": "MIL",
        "Santo Domingo": "SDQ",
    }
    permutations = [
        (start_city,) + p + (end_city,) for p in itertools.permutations(cities)
    ]
    flight_dates = pd.to_datetime(start_date) + pd.to_timedelta(
        np.array([0] + days).cumsum(),
        unit="D",
    )
    # Generate the URLs
    urls = []
    for p in permutations:
        # The pattern for each segment is
        # START-END,nearby/yyyy-mm-dd
        mid_url = "/".join(
            [
                f"{iata[s]}-{iata[e]},nearby/{fd:%Y-%m-%d}"
                for s, e, fd in zip(p[:-1], p[1:], flight_dates)
            ]
        )
        urls.append(f"https://www.kayak.com/flights/{mid_url}/?sort=bestflight_a")
    # Generate the resulting dataframe
    return (
        pd.DataFrame(
            permutations,
            columns=["origin", *[f"city{i+1}" for i in range(len(cities))], "end"],
        )
        .merge(
            pd.DataFrame(
                flight_dates,
                index=[f"flight_dt_{i+1}" for i in range(len(flight_dates))],
            ).T,
            how="cross",
        )
        .assign(kayak_search_url=urls)
    )
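As a quick sanity check, using the inputs defined in the question, the first row should reproduce the desired URL, since itertools.permutations yields the original city order first (the expected output below is wrapped for readability):

df = generate_permutations(cities, days, start_city, end_city, start_date)
print(df.loc[0, 'kayak_search_url'])
# https://www.kayak.com/flights/AMS-WAW,nearby/2023-02-14/WAW-BOG,nearby/2023-02-17/
#   BOG-MIL,nearby/2023-02-20/MIL-SDQ,nearby/2023-02-23/SDQ-AMS,nearby/2023-02-25/?sort=bestflight_a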
I have a pandas dataframe that I have extracted from a JSON file for breweries I'm interested in. Most of these columns are nested lists of dictionaries. However, two columns, 'hours' and 'memberships', are being problematic.
I'd like to extract the 'hours' column into 7 columns: "Mon_Hours", "Tue_Hours", ..., "Sun_Hours".
I have tried and tried to figure this out, but these two columns are proving challenging.
Here is a link to the initial data: https://www.coloradobrewerylist.com/wp-json/cbl_api/v1/locations/?location-type%5Bnin%5D=404,405&page_size=1000&page_token=1
and here is my code:
import requests
import re
import pandas as pd
import numpy as np
import csv
import json
from datetime import datetime
### get the data from the Colorado Brewery list
url = "https://www.coloradobrewerylist.com/wp-json/cbl_api/v1/locations/?location-type%5Bnin%5D=404,405&page_size=1000&page_token=1"
payload={}
headers = {}
response = requests.request("GET", url, headers=headers, data=payload)
data=response.json()
### convert results to table
pd.set_option('display.max_columns', None)
brewdf = pd.DataFrame.from_dict(data['results'])
#brewdf
############################################
#### CLEAN UP NESTED LIST-DICT COLUMNS #####
############################################
## cleanup dogs column
dogs = pd.json_normalize(brewdf['dogs'])
dogs2 = dogs.squeeze()
dogsdf = pd.json_normalize(dogs2)
dogsdf = dogsdf.drop(columns =['id','slug'])
dogsdf = dogsdf.rename(columns={'name':'dogs_allowed'})
#dogsdf
## cleanup parking column
parking = pd.json_normalize(brewdf['parking'])
parking = parking.rename(columns = {0:'Parking1',1:'Parking2',2:'Parking3'})
a = pd.json_normalize(parking['Parking1'])
b = pd.json_normalize(parking['Parking2'])
c = pd.json_normalize(parking['Parking3'])
parkcombo = pd.concat([a,b,c],ignore_index=True, axis=1)
parkcombo = parkcombo.rename(columns = {2:'P1',5:'P2',8:'P3'})
parkcombo['parking_type'] = parkcombo['P1'].map(str) + ',' + parkcombo['P2'].map(str) + ',' + parkcombo['P3'].map(str)
parkcombo['parking_type'] = parkcombo['parking_type'].str.replace(",nan",'')
parkdf = parkcombo['parking_type'].to_frame()
#parkdf
## cleanup food type column
food = pd.json_normalize(brewdf['food_type'])
food
food = food.rename(columns = {0:'Food1',1:'Food2',2:'Food3',3:'Food4',4:'Food5',5:'Food6'})
a = pd.json_normalize(food['Food1'])
b = pd.json_normalize(food['Food2'])
c = pd.json_normalize(food['Food3'])
d = pd.json_normalize(food['Food4'])
e = pd.json_normalize(food['Food5'])
f = pd.json_normalize(food['Food6'])
foodcombo = pd.concat([a,b,c,d,e,f],ignore_index=True, axis =1)
foodcombo
foodcombo = foodcombo.rename(columns = {2:'F1',5:'F2',8:'F3',11:'F4',14:'F5',17:'F6'})
foodcombo['food_type'] = foodcombo['F1'].map(str) + ',' + foodcombo['F2'].map(str) + ',' + foodcombo['F3'].map(str) + ',' + foodcombo['F4'].map(str)+ ',' + foodcombo['F5'].map(str) + ',' + foodcombo['F6'].map(str)
foodcombo['food_type'] = foodcombo['food_type'].str.replace(",nan",'')
fooddf = foodcombo['food_type'].to_frame()
#fooddf
## cleanup patio column
patio = pd.json_normalize(brewdf['patio'])
patio = patio.rename(columns = {0:'P1',1:'P2',2:'P3'})
a = pd.json_normalize(patio['P1'])
b = pd.json_normalize(patio['P2'])
c = pd.json_normalize(patio['P3'])
patiocombo = pd.concat([a,b,c],ignore_index=True, axis =1)
patiocombo
patiocombo = patiocombo.rename(columns = {2:'P1',5:'P2',8:'P3'})
patiocombo['patio_type'] = patiocombo['P1'].map(str) + ',' + patiocombo['P2'].map(str) + ',' + patiocombo['P3'].map(str)
patiocombo['patio_type'] = patiocombo['patio_type'].str.replace(",nan",'')
patiodf = patiocombo['patio_type'].to_frame()
#patiodf
## clean visitor type column
visitor = pd.json_normalize(brewdf['visitors'])
visitor
visitor = visitor.rename(columns = {0:'V1',1:'V2',2:'V3'})
a = pd.json_normalize(visitor['V1'])
b = pd.json_normalize(visitor['V2'])
c = pd.json_normalize(visitor['V3'])
visitorcombo = pd.concat([a,b,c],ignore_index=True, axis =1)
visitorcombo
visitorcombo = visitorcombo.rename(columns = {2:'V1',5:'V2',8:'V3'})
visitorcombo['visitor_type'] = visitorcombo['V1'].map(str) + ',' + visitorcombo['V2'].map(str) + ',' + visitorcombo['V3'].map(str)
visitorcombo['visitor_type'] = visitorcombo['visitor_type'].str.replace(",nan",'')
visitordf = visitorcombo['visitor_type'].to_frame()
#visitordf
## clean tour type column
tour = pd.json_normalize(brewdf['tour_type'])
tour
tour = tour.rename(columns = {0:'T1',1:'T2',2:'T3',3:'T4'})
a = pd.json_normalize(tour['T1'])
b = pd.json_normalize(tour['T2'])
c = pd.json_normalize(tour['T3'])
d = pd.json_normalize(tour['T4'])
tourcombo = pd.concat([a,b,c,d],ignore_index=True, axis =1)
tourcombo
tourcombo = tourcombo.rename(columns = {2:'T1',5:'T2',8:'T3',11:'T4'})
tourcombo['tour_type'] = tourcombo['T1'].map(str) + ',' + tourcombo['T2'].map(str) + ',' + tourcombo['T3'].map(str) + ','+ tourcombo['T4'].map(str)
tourcombo['tour_type'] = tourcombo['tour_type'].str.replace(",nan",'')
tourdf = tourcombo['tour_type'].to_frame()
#tourdf
## clean other drinks column
odrink = pd.json_normalize(brewdf['otherdrinks_type'])
odrink
odrink = odrink.rename(columns = {0:'O1',1:'O2',2:'O3',3:'O4',4:'O5',5:'O6',6:'O7',7:'O8',8:'O9'})
a = pd.json_normalize(odrink['O1'])
b = pd.json_normalize(odrink['O2'])
c = pd.json_normalize(odrink['O3'])
d = pd.json_normalize(odrink['O4'])
e = pd.json_normalize(odrink['O5'])
f = pd.json_normalize(odrink['O6'])
g = pd.json_normalize(odrink['O7'])
h = pd.json_normalize(odrink['O8'])
i = pd.json_normalize(odrink['O9'])
odrinkcombo = pd.concat([a,b,c,d,e,f,g,h,i],ignore_index=True, axis =1)
odrinkcombo
odrinkcombo = odrinkcombo.rename(columns = {2:'O1',5:'O2',8:'O3',11:'O4',14:'O5',17:'O6',20:'O7',23:'O8',26:'O9'})
odrinkcombo['odrink_type'] = odrinkcombo['O1'].map(str) + ',' + odrinkcombo['O2'].map(str) + ',' + odrinkcombo['O3'].map(str) + ','+ odrinkcombo['O4'].map(str) + ','+ odrinkcombo['O5'].map(str)+ ','+ odrinkcombo['O6'].map(str)+ ','+ odrinkcombo['O7'].map(str)+','+ odrinkcombo['O8'].map(str)+','+ odrinkcombo['O9'].map(str)
odrinkcombo['odrink_type'] = odrinkcombo['odrink_type'].str.replace(",nan",'')
odrinkdf = odrinkcombo['odrink_type'].to_frame()
#odrinkdf
## clean to-go column
togo = pd.json_normalize(brewdf['togo_type'])
togo
togo = togo.rename(columns = {0:'TG1',1:'TG2',2:'TG3',3:'TG4',4:'TG5'})
a = pd.json_normalize(togo['TG1'])
b = pd.json_normalize(togo['TG2'])
c = pd.json_normalize(togo['TG3'])
d = pd.json_normalize(togo['TG4'])
e = pd.json_normalize(togo['TG5'])
togocombo = pd.concat([a,b,c,d,e],ignore_index=True, axis =1)
togocombo
togocombo = togocombo.rename(columns = {2:'TG1',5:'TG2',8:'TG3',11:'TG4',14:'TG5'})
togocombo['togo_type'] = togocombo['TG1'].map(str) + ',' + togocombo['TG2'].map(str) + ',' + togocombo['TG3'].map(str) + ','+ togocombo['TG4'].map(str) + ','+ togocombo['TG5'].map(str)
togocombo['togo_type'] = togocombo['togo_type'].str.replace(",nan",'')
togodf = togocombo['togo_type'].to_frame()
#togodf
## clean merch column
merch = pd.json_normalize(brewdf['merch_type'])
merch
merch = merch.rename(columns = {0:'M1',1:'M2',2:'M3',3:'M4',4:'M5',5:'M6',6:'M7',7:'M8',8:'M9',9:'M10',10:'M11',11:'M12'})
a = pd.json_normalize(merch['M1'])
b = pd.json_normalize(merch['M2'])
c = pd.json_normalize(merch['M3'])
d = pd.json_normalize(merch['M4'])
e = pd.json_normalize(merch['M5'])
f = pd.json_normalize(merch['M6'])
g = pd.json_normalize(merch['M7'])
h = pd.json_normalize(merch['M8'])
i = pd.json_normalize(merch['M9'])
j = pd.json_normalize(merch['M10'])
k = pd.json_normalize(merch['M11'])
l = pd.json_normalize(merch['M12'])
merchcombo = pd.concat([a,b,c,d,e,f,g,h,i,j,k,l],ignore_index=True, axis =1)
merchcombo
merchcombo = merchcombo.rename(columns = {2:'M1',5:'M2',8:'M3',11:'M4',14:'M5',17:'M6',20:'M7',23:'M8',26:'M9',29:'M10',32:'M11',35:'M12'})
merchcombo['merch_type'] = (merchcombo['M1'].map(str) + ',' + merchcombo['M2'].map(str) + ',' + merchcombo['M3'].map(str) + ','+ merchcombo['M4'].map(str) + ','
+ merchcombo['M5'].map(str) + ',' + merchcombo['M6'].map(str)+ ',' + merchcombo['M7'].map(str) + ',' + merchcombo['M8'].map(str)
+ ',' + merchcombo['M9'].map(str)+ ',' + merchcombo['M10'].map(str)+ ',' + merchcombo['M11'].map(str)+ ',' + merchcombo['M12'].map(str))
merchcombo['merch_type'] = merchcombo['merch_type'].str.replace(",nan",'')
merchdf = merchcombo['merch_type'].to_frame()
#merchdf
### clean description column
brewdf['description'] = brewdf['description'].str.replace(r'<[^<>]*>', '', regex=True)
#brewdf
### replace nan with null
brewdf = brewdf.replace('nan',np.nan)
brewdf = brewdf.replace('None',np.nan)
brewdf
cleanedbrewdf = brewdf.drop(columns = {'food_type','tour_type','otherdrinks_type','articles','merch_type','togo_type','patio','visitors','parking','dogs'})
mergedbrewdf = pd.concat([cleanedbrewdf,dogsdf,parkdf,fooddf,patiodf,
visitordf,tourdf,odrinkdf,togodf,merchdf,],ignore_index=False,axis=1)
mergedbrewdf
### remove non-existing
finalbrewdf = mergedbrewdf.loc[(mergedbrewdf['lon'].notnull())].copy()
finalbrewdf['lon'] = finalbrewdf['lon'].astype(float)
finalbrewdf['lat'] = finalbrewdf['lat'].astype(float)
finalbrewdf
Can someone please point me in the right direction for the hours and memberships columns? Also, is there a more efficient way to loop through these different columns? They have different nested list-dict lengths, which I thought might prevent me from writing a single function.
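On the "more efficient way" part: each repeated block above normalises a list-of-dict column and keeps the 'name' field, so one helper can replace them. A minimal sketch, assuming every cell is either empty/NaN or a list of dicts that each carry a 'name' key (as the dogs and parking blocks imply); the 'hours' and 'memberships' fields likely have a different shape, so inspect a sample row (e.g. brewdf['hours'].iloc[0]) before splitting them into the Mon_Hours..Sun_Hours columns:

import numpy as np

def flatten_names(series, new_name):
    """Join the 'name' values of a list-of-dicts column into one comma-separated string."""
    def join_names(cell):
        if isinstance(cell, list):
            names = [d.get('name') for d in cell if isinstance(d, dict) and d.get('name')]
            return ','.join(names) if names else np.nan
        return np.nan
    return series.apply(join_names).rename(new_name)

parkdf = flatten_names(brewdf['parking'], 'parking_type').to_frame()
fooddf = flatten_names(brewdf['food_type'], 'food_type').to_frame()
# ...and the same call for patio, visitors, tour_type, otherdrinks_type, togo_type and merch_type.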
I have this categorical dataset in which I would like each category/group to be a layer I can turn ON/OFF.
I am able to add the groups to 'LayerControl', but it doesn't work as expected.
import folium
m = folium.Map(location=[39.712183, -104.998424], zoom_start=5)
latLong = [(36.314292, -117.517516, 'P1'),
(40.94859, -116.15316, 'P2'),
(34.14757, -119.81985, 'P3'),
(46.31292, -117.57516, 'P4'),
(41.04159, -116.15316, 'P2'),
(44.22093, -119.821985,'P2'),
(42.25308, -117.27589, 'P3'),
(41.60302, -115.97012, 'P4'),
(44.35519, -117.94183, 'P4'),
(44.02027, -117.22198, 'P1'),
(45.91613, -113.05364, 'P5'),
(48.17537, -117.90075, 'P1'),
(37.65961, -117.61321, 'P1')]
for x in range(0, len(latLong)):
    point_layer = folium.FeatureGroup(name=latLong[x][2])
    for lat, lng, nameP in latLong:
        point_layer.add_child(folium.CircleMarker(location=[lat, lng], radius=10,
                                                  popup=str(nameP) + " Lat: " + str(lat) + " , Long: " + str(lng),
                                                  tooltip=str(nameP) + " Lat: " + str(lat) + " , Long: " + str(lng),
                                                  fill=True,  # Set fill to True
                                                  color='red',
                                                  fill_opacity=1.0)).add_to(m)
    m.add_child(point_layer)
m.add_child(folium.LayerControl(collapsed=False))
m.save("Map1.html")
As you can see above, the buttons are off yet the points/circles still display on the map. Any ideas on how to fix this?
To associate a marker with a layer, create a point group, set the marker to belong to it, and then add the point group to the map. The code in the question creates a new point group on every loop iteration; instead, we can create the point groups in advance and then add them to the map.
import folium
m = folium.Map(location=[39.712183, -104.998424], zoom_start=5)
latLong = [(36.314292, -117.517516, 'P1'),
(40.94859, -116.15316, 'P2'),
(34.14757, -119.81985, 'P3'),
(46.31292, -117.57516, 'P4'),
(41.04159, -116.15316, 'P2'),
(44.22093, -119.821985,'P2'),
(42.25308, -117.27589, 'P3'),
(41.60302, -115.97012, 'P4'),
(44.35519, -117.94183, 'P4'),
(44.02027, -117.22198, 'P1'),
(45.91613, -113.05364, 'P5'),
(48.17537, -117.90075, 'P1'),
(37.65961, -117.61321, 'P1')]
# point_layer name list
all_gp = []
for x in range(len(latLong)):
    pg = latLong[x][2]
    all_gp.append(pg)
# Create point_layer object
unique_gp = list(set(all_gp))
vlist = []
for i, k in enumerate(unique_gp):
    locals()[f'point_layer{i}'] = folium.FeatureGroup(name=k)
    vlist.append(locals()[f'point_layer{i}'])
# Creating list for point_layer
pl_group = []
for n in all_gp:
    for v in vlist:
        if n == vars(v)['layer_name']:
            pl_group.append(v)
for (lat, lng, nameP), pg in zip(latLong, pl_group):
    folium.CircleMarker(location=[lat, lng], radius=10,
                        popup=str(nameP) + " Lat: " + str(lat) + " , Long: " + str(lng),
                        tooltip=str(nameP) + " Lat: " + str(lat) + " , Long: " + str(lng),
                        fill=True,  # Set fill to True
                        color='red',
                        fill_opacity=1.0).add_to(pg)
    pg.add_to(m)
m.add_child(folium.LayerControl(collapsed=False))
#m.save("Map1.html")
m
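The locals() bookkeeping can also be replaced by a plain dict keyed by group name. A minimal sketch of the same grouping idea, assuming the latLong list above:

import folium

m = folium.Map(location=[39.712183, -104.998424], zoom_start=5)

# One FeatureGroup per unique label, created up front.
groups = {name: folium.FeatureGroup(name=name) for _, _, name in latLong}

for lat, lng, nameP in latLong:
    folium.CircleMarker(
        location=[lat, lng], radius=10,
        popup=f"{nameP} Lat: {lat} , Long: {lng}",
        tooltip=f"{nameP} Lat: {lat} , Long: {lng}",
        fill=True, color='red', fill_opacity=1.0,
    ).add_to(groups[nameP])

for fg in groups.values():
    fg.add_to(m)

m.add_child(folium.LayerControl(collapsed=False))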
Dynamically create string from pandas column
I have three data frames like the ones below; one is df and another is anomalies:
d = {'10028': [0], '1058': [25], '20120': [29], '20121': [22],'20122': [0], '20123': [0], '5043': [0], '5046': [0]}
df1 = pd.DataFrame(data=d)
Basically, anomalies is a mirror copy of df, except that in anomalies each value is 0 or 1: a 1 indicates an anomaly and a 0 indicates no anomaly.
d = {'10028': [0], '1058': [1], '20120': [1], '20121': [0],'20122': [0], '20123': [0], '5043': [0], '5046': [0]}
df2 = pd.DataFrame(data=d)
And a third data frame like below:
d = {'10028': ['US,IN'], '1058': ['NA, JO, US'], '20120': [''], '20121': ['US,PK'],'20122': ['IN'], '20123': ['Us,LN'], '5043': ['AI,AL'], '5046': ['AA,AB']}
df3 = pd.DataFrame(data=d)
and I am converting that into a specific format with the code below:
details = (
'\n' + 'Metric Name' + '\t' + 'Count' + '\t' + 'Anomaly' + '\t' + 'Country'
'\n' + '10028:' + '\t'+ '\t' + str(df1.tail(1)['10028'][0]) + '\t' + str(df2['10028'][0]) + '\t'+ str(df3['10028'][0]) +
'\n' + '1058:' + '\t' + '\t' + str(df1.tail(1)['1058'][0]) + '\t' + str(df2['1058'][0]) + '\t'+ str(df3['1058'][0]) +
'\n' + '20120:' + '\t' +'\t' + str(df1.tail(1)['20120'][0]) + '\t' + str(df2['20120'][0]) + '\t'+ str(df3['20120'][0]) +
'\n' + '20121:' + '\t' + '\t' +str(round(df1.tail(1)['20121'][0], 2)) + '\t' + str(df2['20121'][0]) + '\t'+ str(df3['20121'][0]) +
'\n' + '20122:' + '\t' + '\t' +str(round(df1.tail(1)['20122'][0], 2)) + '\t' + str(df2['20122'][0]) + '\t'+str(df3['20122'][0]) +
'\n' + '20123:' + '\t' + '\t' +str(round(df1.tail(1)['20123'][0], 3)) + '\t' + str(df2['20123'][0]) + '\t'+str(df3['20123'][0]) +
'\n' + '5043:' + '\t' + '\t' +str(round(df1.tail(1)['5043'][0], 3)) + '\t' + str(df2['5043'][0]) + '\t'+str(df3['5043'][0]) +
'\n' + '5046:' + '\t' + '\t' +str(round(df1.tail(1)['5046'][0], 3)) + '\t' + str(df2['5046'][0]) + '\t'+str(df3['5046'][0]) +
'\n\n' + 'message:' + '\t' +
'Something wrong with the platform as there is a spike in [values where anomalies == 1].'
)
The problem is that the column names change in every run. In this run they are '10028', '1058', '20120', '20121', '20122', '20123', '5043', '5046', but in the next run they might be '10029', '1038', '20121', '20122', '20123', '5083', '5946'.
How can I create the details string dynamically depending on which columns are present in the data frame, since I don't want to hard-code them? And in the message I want to pass the names of the columns whose value is 1.
The values of the columns will always be either 1 or 0 for df1 and df2, and for df3 either a list or blank.
For two data frames I got a working solution, which is below:
# first part of the string
s = '\n' + 'Metric Name' + '\t' + 'Count' + '\t' + 'Anomaly'
# dynamically add the data
for idx, val in df1.iloc[-1].iteritems():  # note: .iteritems() was removed in pandas 2.0; use .items() there
    s += f'\n{idx}\t{val}\t{df2[idx][0]}'
# last part
s += ('\n\n' + 'message:' + '\t' +
'Something wrong with the platform as there is a spike in [values where anomalies == 1].'
)
And if the matching value is not present, then print null.
To obtain the expected result, you can do the following (the input data must be the dictionaries as shown in the question; if not, please provide the real input data):
import pandas as pd
final_d = []
d = {'10028': 0, '1058': 25, '20120': 29, '20121': 22,'20122': 0, '20123': 0, '5043': 0, '5046': 0}
final_d.append(d)
d = {'10028': 0, '1058': 1, '20120': 1, '20121': 0,'20122': 0, '20123': 0, '5043': 0, '5046': 0, '91111':0}
final_d.append(d)
d = {'10028': ['US','IN'], '1058': ['NA', 'JO', 'US'], '20120': [''], '20121': ['US','PK'],'20122': ['IN'], '20123': ['Us','LN'], '5043': ['AI','AL'], '5046': ['AA','AB'], '00000':['kk','dd','ee']}
final_d.append(d)
# Now, we will merge the dictionaries on key
data = {}
for i, dt in enumerate(final_d):
    for k, v in dt.items():
        if k in data:
            if type(v) == list:
                data[k][i] = ','.join(v)
            else:
                data[k][i] = v
        else:
            data[k] = [''] * len(final_d)
            if type(v) == list:
                data[k][i] = ','.join(v)
            else:
                data[k][i] = v
maxlen = max([len(v) for v in data.values()])
data = {k:v if len(v)==maxlen else v+['']*(maxlen-len(v)) for k,v in data.items()}
# Creating the base dataframe
df = pd.DataFrame.from_dict(data)
# Converting the column headers (metric names) into a row in the dataframe
df = pd.concat([pd.DataFrame.from_dict({k:[v] for k,v in zip(df.columns.tolist(), df.columns.tolist())}), df], ignore_index=True)
# removing column names
df.columns = [''] * len(df.columns)
# organising the dataframe according to your required output
result = df.T.reset_index(drop=True)
# Adding the column names as required
result.columns = ['Metric Name', 'Count', 'Anomaly', 'Country']
# Voila!
print(result.to_string(index=False))
The generated dataframe:
Metric Name  Count  Anomaly   Country
      10028      0        0     US,IN
       1058     25        1  NA,JO,US
      20120     29        1
      20121     22        0     US,PK
      20122      0        0        IN
      20123      0        0     Us,LN
       5043      0        0     AI,AL
       5046      0        0     AA,AB
      91111               0
      00000                  kk,dd,ee
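If the goal is the original tab-separated details string rather than a dataframe, the two-frame loop from the question extends naturally to three frames, and the anomaly columns for the message can be collected along the way. A minimal sketch, assuming the df1, df2 and df3 defined at the top of the question and printing null when a column is missing:

s = '\nMetric Name\tCount\tAnomaly\tCountry'
anomalous = []

for col, val in df1.iloc[-1].items():  # .items() replaces the deprecated .iteritems()
    anomaly = df2[col][0] if col in df2.columns else 'null'
    country = df3[col][0] if col in df3.columns else 'null'
    s += f'\n{col}:\t\t{val}\t{anomaly}\t{country}'
    if anomaly == 1:
        anomalous.append(col)

s += ('\n\nmessage:\t'
      f'Something wrong with the platform as there is a spike in {", ".join(anomalous)}.')
print(s)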
I am trying to scrape data from soccerway.com, checking whether each page is a completed game or a game still to be played, with each instance written to a separate csv file. I am running through 10,000 pages, so I have written it using Pools. However, I am getting empty lists from the append calls and cannot write anything to the csv files.
I tried writing straight to the file instead of appending to lists, however this gave incomplete files.
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import uuid
import time
from multiprocessing import Pool
import sys, os
fixturesA = []
linksA = []
statsA = []
def parse(url):
    try:
        #print(url)
        delays = [0.25,0.5,0.75,1]
        delay = np.random.choice(delays)
        #time.sleep(delay)
        #r = requests.get(url)
        r = requests.get(url, timeout = 10)
        soup = BeautifulSoup(r.content, "html.parser")
        teams = soup.findAll('h3', attrs = {'class' : 'thick'})
        homeTeam = teams[0].text.strip()
        awayTeam = teams[2].text.strip()
        middle = teams[1].text.strip()
        dds = soup.findAll('dd')
        date = dds[1].text.strip()
        gameWeek = dds[2].text.strip()
        if ':' not in middle:
            middle = middle.split(" - ")
            homeGoals = 0
            awayGoals = 0
            homeGoals = middle[0]
            try:
                awayGoals = middle[1]
            except Exception as e:
                homeGoals = "-1"
                awayGoals = "-1"
            matchGoals = int(homeGoals) + int(awayGoals)
            if(matchGoals >= 0):
                if(int(homeGoals) > 0 and int(awayGoals) > 0):
                    btts = "y"
                else:
                    btts = "n"
                halfTimeScore = dds[4].text.strip().split(" - ")
                firstHalfHomeGoals = halfTimeScore[0]
                firstHalfAwayConc = halfTimeScore[0]
                firstHalfAwayGoals = halfTimeScore[1]
                firstHalfHomeConc = halfTimeScore[1]
                firstHalfTotalGoals = int(firstHalfHomeGoals) + int(firstHalfAwayGoals)
                secondHalfHomeGoals = int(homeGoals) - int(firstHalfHomeGoals)
                secondHalfAwayConc = int(homeGoals) - int(firstHalfHomeGoals)
                secondHalfAwayGoals = int(awayGoals) - int(firstHalfAwayGoals)
                secondHalfHomeConc = int(awayGoals) - int(firstHalfAwayGoals)
                secondHalfTotalGoals = matchGoals - firstHalfTotalGoals
                homeTeamContainers = soup.findAll('div', attrs = {'class' : 'container left'})
                homeTeamStarting = homeTeamContainers[2]
                homeTeamBench = homeTeamContainers[3]
                homeTeamYellows = len(homeTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/YC.png' })) + len(homeTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/YC.png' }))
                homeTeamReds = len(homeTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/RC.png' })) + len(homeTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/RC.png' }))
                homeTeamCards = homeTeamYellows + homeTeamReds
                awayTeamContainers = soup.findAll('div', attrs = {'class' : 'container right'})
                awayTeamStarting = awayTeamContainers[2]
                awayTeamBench = awayTeamContainers[3]
                awayTeamYellows = len(awayTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/YC.png' })) + len(awayTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/YC.png' }))
                awayTeamReds = len(awayTeamStarting.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/700/img/events/RC.png' })) + len(awayTeamBench.findAll('img', attrs = {'src' : 'https://s1.swimg.net/gsmf/699/img/events/RC.png' }))
                awayTeamCards = awayTeamYellows + awayTeamReds
                matchCards = homeTeamCards + awayTeamCards
                try:
                    iframe = soup.findAll('iframe')
                    iframeSrc = iframe[1]['src']
                    url = 'https://us.soccerway.com/' + iframeSrc
                    c = requests.get(url,timeout = 10)
                    soupC = BeautifulSoup(c.content, "html.parser")
                    cornerContainer = soupC.findAll('td', attrs = {'class' : 'legend left value'})
                    homeCorners = cornerContainer[0].text.strip()
                    awayCornersConc = homeCorners
                    cornerContainer = soupC.findAll('td', attrs = {'class' : 'legend right value'})
                    awayCorners = cornerContainer[0].text.strip()
                    homeCornersConc = awayCorners
                    matchCorners = int(homeCorners) + int(awayCorners)
                    print("Got Score . " + homeTeam + " vs " + awayTeam+" . " + gameWeek )
                    statsA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + homeCorners + "," + awayCorners + "," + homeCornersConc + "," + awayCornersConc + "," + str(matchCorners)+","+dds[0].text.strip() + "\n")
                    return None
                except Exception as e:
                    print("Got Score no corners. " + homeTeam + " vs " + awayTeam+" . " + gameWeek + " NO FRAME")
                    statsA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + homeGoals + "," + awayGoals + "," + str(matchGoals) + "," + btts + "," + firstHalfHomeGoals + "," + firstHalfHomeConc + "," + firstHalfAwayGoals + "," + firstHalfAwayConc + "," + str(firstHalfTotalGoals) + "," + str(secondHalfHomeGoals) + "," + str(secondHalfHomeConc) + "," + str(secondHalfAwayGoals) + "," + str(secondHalfAwayConc) + "," + str(secondHalfTotalGoals) + "," + str(homeTeamCards) + "," + str(awayTeamCards) + "," + str(matchCards) + "," + "" + "," + "" + "," + "" + "," + "" + "," + ""+","+dds[0].text.strip() + "\n")
                    return None
        else:
            fixturesA.append(homeTeam + "," + awayTeam + "," + gameWeek + "," + date + "\n")
            linksA.append(url + "\n")
            print(homeTeam + " vs " + awayTeam + " at " + middle + " GW:" + gameWeek)
            return None
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        linksA.append(url + "\n")
        print(url)
        return None
stats = open('Statsv2.csv','a',encoding='utf-8')
fixtures = open('fixturesv2.csv','w',encoding='utf-8')
with open('links.txt') as f:
    content = f.readlines()
content = [x.strip() for x in content]
links = open('links.txt','w')
if __name__ == '__main__':
    start_time = time.time()
    p = Pool(20) # Pool tells how many at a time
    records = p.map(parse, content)
    p.terminate()
    p.join()
    print("--- %s seconds ---" % (time.time() - start_time))
I assume you are running Windows? Then the answer is that multiprocessing on Windows creates copies (spawned processes) instead of forks. So you have your main process with the lists, and your worker processes (from the pool) get their own separate set of lists.
The workers most likely fill their lists correctly, but the lists in the main process don't receive any data and so stay empty. And the workers do not return anything. So, as you write your files in the main process, you get empty files.
An easy way to solve this is to create pipes or queues between the main process and the workers to allow communication between the processes. You could also use shared arrays as provided by the multiprocessing module, but then you would need to know the length at creation time.
See the documentation: multiprocessing
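As a rough illustration of the queue suggestion (a sketch only, not the original code): with a Pool the queue has to come from a Manager so it can be shared across processes, and content is the link list from the question:

from functools import partial
from multiprocessing import Manager, Pool

def parse(url, queue):
    # ...scrape as in the question, then push a result instead of appending to a global list
    queue.put((url, 'placeholder,csv,row'))  # placeholder payload for illustration

if __name__ == '__main__':
    with Manager() as manager:
        queue = manager.Queue()
        with Pool(20) as pool:
            pool.map(partial(parse, queue=queue), content)
        results = []
        while not queue.empty():
            results.append(queue.get())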
As pointed out by @RaJa, you're not actually doing anything the parent/controlling process can see. The easiest fix is just to return values from the mapped function.
For example, parse() could return a tuple at the end:
def parse(url):
    # do work
    return url, homeTeam, awayTeam, gameWeek, homeGoals, awayGoals # ...
then the parent process can receive the values and do useful things like saving them to a CSV file:
import csv

with Pool(20) as pool:
    records = pool.map(parse, content)

with open('stats.csv', 'w') as fd:
    out = csv.writer(fd)
    out.writerow([
        'url', 'hometeam', 'awayteam',
        # and the remaining column names for the header
    ])
    out.writerows(records)
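One caveat: the question's parse() returns None on its error paths, so the mapped records may contain None entries; filtering them before writerows keeps failed pages out of the CSV (and passing newline='' to open() avoids blank rows on Windows):

# Drop any pages that failed to parse before writing them out.
records = [r for r in records if r is not None]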