Dynamically create string from pandas column
I have three data frames like the ones below; one is df and another one is anomalies:
d = {'10028': [0], '1058': [25], '20120': [29], '20121': [22],'20122': [0], '20123': [0], '5043': [0], '5046': [0]}
df1 = pd.DataFrame(data=d)
Basically anomalies is a mirror copy of df; the only difference is that in anomalies every value is either 0 or 1, where 1 marks an anomaly and 0 a non-anomaly:
d = {'10028': [0], '1058': [1], '20120': [1], '20121': [0],'20122': [0], '20123': [0], '5043': [0], '5046': [0]}
df2 = pd.DataFrame(data=d)
And a third data frame like the one below:
d = {'10028': ['US,IN'], '1058': ['NA, JO, US'], '20120': [''], '20121': ['US,PK'],'20122': ['IN'], '20123': ['Us,LN'], '5043': ['AI,AL'], '5046': ['AA,AB']}
df3 = pd.DataFrame(data=d)
I am converting these into a specific format with the code below:
details = (
'\n' + 'Metric Name' + '\t' + 'Count' + '\t' + 'Anomaly' + '\t' + 'Country'
'\n' + '10028:' + '\t'+ '\t' + str(df1.tail(1)['10028'][0]) + '\t' + str(df2['10028'][0]) + '\t'+ str(df3['10028'][0]) +
'\n' + '1058:' + '\t' + '\t' + str(df1.tail(1)['1058'][0]) + '\t' + str(df2['1058'][0]) + '\t'+ str(df3['1058'][0]) +
'\n' + '20120:' + '\t' +'\t' + str(df1.tail(1)['20120'][0]) + '\t' + str(df2['20120'][0]) + '\t'+ str(df3['20120'][0]) +
'\n' + '20121:' + '\t' + '\t' +str(round(df1.tail(1)['20121'][0], 2)) + '\t' + str(df2['20121'][0]) + '\t'+ str(df3['20121'][0]) +
'\n' + '20122:' + '\t' + '\t' +str(round(df1.tail(1)['20122'][0], 2)) + '\t' + str(df2['20122'][0]) + '\t'+str(df3['20122'][0]) +
'\n' + '20123:' + '\t' + '\t' +str(round(df1.tail(1)['20123'][0], 3)) + '\t' + str(df2['20123'][0]) + '\t'+str(df3['20123'][0]) +
'\n' + '5043:' + '\t' + '\t' +str(round(df1.tail(1)['5043'][0], 3)) + '\t' + str(df2['5043'][0]) + '\t'+str(df3['5043'][0]) +
'\n' + '5046:' + '\t' + '\t' +str(round(df1.tail(1)['5046'][0], 3)) + '\t' + str(df2['5046'][0]) + '\t'+str(df3['5046'][0]) +
'\n\n' + 'message:' + '\t' +
'Something wrong with the platform as there is a spike in [values where anomalies == 1].'
)
The problem is that the column names change on every run. In this run they are '10028', '1058', '20120', '20121', '20122', '20123', '5043', '5046', but in the next run they might be '10029', '1038', '20121', '20122', '20123', '5083', '5946'.
How can I create details dynamically based on whichever columns are present in the data frames, without hard-coding them? In the message I also want to pass the names of the columns whose value is 1.
The column values will always be either 1 or 0 for df1 and df2, and for df3 either a list or blank.
Expected output:
For two of the data frames I already have a working solution, which is below:
# first part of the string
s = '\n' + 'Metric Name' + '\t' + 'Count' + '\t' + 'Anomaly'
# dynamically add the data
for idx, val in df1.iloc[-1].iteritems():
    s += f'\n{idx}\t{val}\t{df2[idx][0]}'
# last part
s += ('\n\n' + 'message:' + '\t' +
      'Something wrong with the platform as there is a spike in [values where anomalies == 1].'
      )
If a matching value is not present in the other data frames, I want to print null instead.
To obtain the expected result, you can do the following (the input data must be the dictionaries shown in the question; if not, please provide the real input data):
import pandas as pd

final_d = []
d = {'10028': 0, '1058': 25, '20120': 29, '20121': 22, '20122': 0, '20123': 0, '5043': 0, '5046': 0}
final_d.append(d)
d = {'10028': 0, '1058': 1, '20120': 1, '20121': 0, '20122': 0, '20123': 0, '5043': 0, '5046': 0, '91111': 0}
final_d.append(d)
d = {'10028': ['US','IN'], '1058': ['NA', 'JO', 'US'], '20120': [''], '20121': ['US','PK'], '20122': ['IN'], '20123': ['Us','LN'], '5043': ['AI','AL'], '5046': ['AA','AB'], '00000': ['kk','dd','ee']}
final_d.append(d)

# Now, we will merge the dictionaries on key
data = {}
for i, dt in enumerate(final_d):
    for k, v in dt.items():
        if k in data:
            if type(v) == list:
                data[k][i] = ','.join(v)
            else:
                data[k][i] = v
        else:
            data[k] = [''] * len(final_d)
            if type(v) == list:
                data[k][i] = ','.join(v)
            else:
                data[k][i] = v

maxlen = max([len(v) for v in data.values()])
data = {k: v if len(v) == maxlen else v + [''] * (maxlen - len(v)) for k, v in data.items()}

# Creating the base dataframe
df = pd.DataFrame.from_dict(data)
# Converting the column headers (metric names) into a row in the dataframe
df = pd.concat([pd.DataFrame.from_dict({k: [v] for k, v in zip(df.columns.tolist(), df.columns.tolist())}), df], ignore_index=True)
# removing column names
df.columns = [''] * len(df.columns)
# organising the dataframe according to your required output
result = df.T.reset_index(drop=True)
# Adding the column names as required
result.columns = ['Metric Name', 'Count', 'Anomaly', 'Country']
# Voila!
print(result.to_string(index=False))
The generated dataframe:
Metric Name Count Anomaly  Country
      10028     0       0    US,IN
       1058    25       1 NA,JO,US
      20120    29       1
      20121    22       0    US,PK
      20122     0       0       IN
      20123     0       0    Us,LN
       5043     0       0    AI,AL
       5046     0       0    AA,AB
      91111             0
      00000               kk,dd,ee
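To also build the message with the names of the metrics whose anomaly flag is 1 (which the question asks for but the snippet above does not produce), a possible follow-up is sketched below; it assumes the result dataframe constructed above, where the Anomaly column holds the 0/1 flags.
# Collect the metric names whose Anomaly value equals 1 and append the
# message to the formatted table.
spiking = result.loc[result['Anomaly'] == 1, 'Metric Name'].tolist()
details = (
    result.to_string(index=False)
    + '\n\nmessage:\tSomething wrong with the platform as there is a spike in '
    + ', '.join(spiking) + '.'
)
print(details)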
Related
I would like to pass n cities to travel to, and the corresponding number of days in each city, to a function that returns a df with all possible permutations of the journey. The kayak_search_url column in the df should contain this string in the first row:
https://www.kayak.com/flights/AMS-WAW,nearby/2023-02-14/WAW-BOG,nearby/2023-02-17/BOG-MIL,nearby/2023-02-20/MIL-SDQ,nearby/2023-02-23/SDQ-AMS,nearby/2023-02-25/?sort=bestflight_a
...but instead contains this string:
https://www.kayak.com/flights/AMS-WAW,nearby/2023-02-14/AMS-BOG,nearby/2023-02-17/AMS-MIL,nearby/2023-02-20/AMS-SDQ,nearby/2023-02-23/AMS,nearby/2023-02-25/?sort=bestflight_a
I can't figure out why the origin code 'AMS' shows up instead of the chain of cities. Here's the code:
import itertools

import pandas as pd

# List the cities you want to travel to and from, how long you'd like to stay in each, and the appropriate start/end dates
start_city = 'Amsterdam'
end_city = 'Amsterdam'
start_date = '2023-02-14'
cities = ['Warsaw', 'Bogota', 'Milan', 'Santo Domingo']
days = [3, 3, 3, 2]

def generate_permutations(cities, days, start_city, end_city, start_date):
    city_to_days = dict(zip(cities, days))
    permutations = list(itertools.permutations(cities))
    df = pd.DataFrame(permutations, columns=['city' + str(i) for i in range(1, len(cities) + 1)])
    df['origin'] = start_city
    df['end'] = end_city
    first_column = df.pop('origin')
    df.insert(0, 'origin', first_column)
    st_dt = pd.to_datetime(start_date)
    df = df.assign(flight_dt_1=st_dt)
    for i in range(len(cities)):
        df['flight_dt_' + str(i + 2)] = df['flight_dt_' + str(i + 1)] + df['city' + str(i + 1)].map(city_to_days).map(lambda x: pd.Timedelta(days=x))

    # IATA city code dictionary from iata_code.csv file in repo and create Kayak 'url' column for each permutation
    iata = {'Amsterdam': 'AMS',
            'Warsaw': 'WAW',
            'Bogota': 'BOG',
            'Milan': 'MIL',
            'Santo Domingo': 'SDQ'}
    url = 'https://www.kayak.com/flights/'
    df['kayak_search_url'] = df.apply(lambda x: url + ''.join([iata[x['origin']] + '-' + iata[x['city' + str(i+1)]] +
                                                               ',nearby/' + str(x['flight_dt_' + str(i+1)].strftime("%Y-%m-%d")) + '/'
                                                               for i in range(len(cities))]) + iata[x['end']] + ',nearby/' +
                                                str(x['flight_dt_' + str(len(cities) + 1)].strftime("%Y-%m-%d")) +
                                                '/?sort=bestflight_a', axis=1)
    return df
Let's break down the desired URL to highlight its structure:
https://www.kayak.com/flights
/AMS-WAW,nearby/2023-02-14
/WAW-BOG,nearby/2023-02-17
/BOG-MIL,nearby/2023-02-20
/MIL-SDQ,nearby/2023-02-23
/SDQ-AMS,nearby/2023-02-25
/?sort=bestflight_a
Obviously only the middle section needs to be generated, as the other parts are static. We can also generate that middle section before constructing the dataframe:
import itertools

import numpy as np
import pandas as pd


def generate_permutations(cities, days, start_city, end_city, start_date):
    iata = {
        "Amsterdam": "AMS",
        "Warsaw": "WAW",
        "Bogota": "BOG",
        "Milan": "MIL",
        "Santo Domingo": "SDQ",
    }
    permutations = [
        (start_city,) + p + (end_city,) for p in itertools.permutations(cities)
    ]
    flight_dates = pd.to_datetime(start_date) + pd.to_timedelta(
        np.array([0] + days).cumsum(),
        unit="D",
    )

    # Generate the URLs
    urls = []
    for p in permutations:
        # The pattern for each segment is
        #   START-END,nearby/yyyy-mm-dd
        mid_url = "/".join(
            [
                f"{iata[s]}-{iata[e]},nearby/{fd:%Y-%m-%d}"
                for s, e, fd in zip(p[:-1], p[1:], flight_dates)
            ]
        )
        urls.append(f"https://www.kayak.com/flights/{mid_url}/?sort=bestflight_a")

    # Generate the resulting dataframe
    return (
        pd.DataFrame(
            permutations,
            columns=["origin", *[f"city{i+1}" for i in range(len(cities))], "end"],
        )
        .merge(
            pd.DataFrame(
                flight_dates,
                index=[f"flight_dt_{i+1}" for i in range(len(flight_dates))],
            ).T,
            how="cross",
        )
        .assign(kayak_search_url=urls)
    )
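As a quick check, the function can be called with the inputs from the question; this is just a usage sketch and assumes itertools, numpy (as np) and pandas (as pd) are imported as in the function above.
result = generate_permutations(
    cities=['Warsaw', 'Bogota', 'Milan', 'Santo Domingo'],
    days=[3, 3, 3, 2],
    start_city='Amsterdam',
    end_city='Amsterdam',
    start_date='2023-02-14',
)
# The first permutation keeps the original city order, so its URL should match
# the one expected in the question:
# https://www.kayak.com/flights/AMS-WAW,nearby/2023-02-14/WAW-BOG,nearby/2023-02-17/...
print(result.loc[0, 'kayak_search_url'])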
I have a problem.
After fetching my .csv file in Python I keep getting the following error:
ValueError: Location values cannot contain NANs.
My code looks like this:
df = pd.read_csv("surveyed.csv")
fc = folium.FeatureGroup(name="Tbs", overlay=True)
cf_survey_cluster = MarkerCluster(name="Tbs").add_to(map)
for i, row in df.iterrows():
    city = df.at[i, 'City']
    address = df.at[i, 'Address']
    postcode = df.at[i, 'Post Code']
    dead = df.at[i, 'Deadline']
    lat = df.at[i, 'Latitude']
    lng = df.at[i, 'Longitude']
    popup = '<b>CITY: </b>' + str(city) + '<br>' + '<b>ADDRESS: </b>' + str(address) + ', ' + str(postcode) + '<br>' + '<b>DEADLINE: </b>' + str(dead)
    cf_survey_marker = folium.Marker(location=[lat, lng], popup=popup, icon=folium.Icon(color='green', icon='glyphicon-calendar'))
My .csv file is fine, no gaps seen at all.
I searched for the following error:
ValueError: Location values cannot contain NaNs, got: [nan, nan]
but I don't know how to apply the isnull check in my code. I tried:
lat = df[df.isnull(at[i, 'Latitude'])]
but now the error shows:
The value at is not defined.
Is there any way to fix this?
UPDATE:
This approach also doesn't work:
df = pd.read_csv("surveyed.csv")
fc = folium.FeatureGroup(name="To be surveyed", overlay=True)
cf_survey_cluster = MarkerCluster(name="To be surveyed").add_to(map)
for i, row in df.iterrows():
    city = df.at[i, 'City']
    address = df.at[i, 'Address']
    postcode = df.at[i, 'Post Code']
    dead = df.at[i, 'Deadline']
    #lat = df.at[i, 'Latitude']
    #lng = df.at[i, 'Longitude']
    latlon = df.dropna(subset=['Longitude', 'Latitude'])
    popup = ('<b>CITY: </b>' + str(city) + '<br>' + '<b>ADDRESS: </b>' +
             str(address) + ', ' + str(postcode) + '<br>' + '<b>DEADLINE: </b>' +
             str(dead))
    cf_survey_marker = folium.Marker(location=[latlon], popup=popup, icon=folium.Icon(color='green', icon='glyphicon-calendar'))
as I get an error:
ValueError: Expected two (lat, lon) values for location, instead got: [ City Address ... Latitude Longitude
You can use the dropna() function to remove NaN values. By default it drops rows that contain NaN (axis='columns' would drop whole columns instead); here you only want to drop the rows with missing coordinates, so restrict it with the subset argument.
Example:
df = df.dropna(subset=['Longitude','Latitude'])
df = pd.read_csv("surveyed.csv")
df = df.dropna(subset=['Longitude', 'Latitude'])
fc = folium.FeatureGroup(name="To be surveyed", overlay=True)
cf_survey_cluster = MarkerCluster(name="To be surveyed").add_to(map)
for i, row in df.iterrows():
    city = df.at[i, 'City']
    address = df.at[i, 'Address']
    postcode = df.at[i, 'Post Code']
    dead = df.at[i, 'Deadline']
    lat = df.at[i, 'Latitude']
    lng = df.at[i, 'Longitude']
    popup = ('<b>CITY: </b>' + str(city) + '<br>' + '<b>ADDRESS: </b>' +
             str(address) + ', ' + str(postcode) + '<br>' + '<b>DEADLINE: </b>' +
             str(dead))
    cf_survey_marker = folium.Marker(location=[lat, lng], popup=popup, icon=folium.Icon(color='green', icon='glyphicon-calendar'))
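If you would rather keep the incomplete rows in the dataframe and only skip them when placing markers, a variation on the same loop is sketched below. It reuses the names from the code above (df read without the dropna call, cf_survey_cluster, folium, pd) and also adds each marker to the cluster, which the original snippet never does.
for i, row in df.iterrows():
    lat = row['Latitude']
    lng = row['Longitude']
    if pd.isna(lat) or pd.isna(lng):
        continue  # skip rows without usable coordinates instead of crashing
    popup = ('<b>CITY: </b>' + str(row['City']) + '<br>' +
             '<b>ADDRESS: </b>' + str(row['Address']) + ', ' + str(row['Post Code']) + '<br>' +
             '<b>DEADLINE: </b>' + str(row['Deadline']))
    folium.Marker(
        location=[lat, lng],
        popup=popup,
        icon=folium.Icon(color='green', icon='glyphicon-calendar'),
    ).add_to(cf_survey_cluster)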
I am trying to store values read from Excel sheet cells in a list. The code below collects data from consecutive rows and columns and builds a string from those values. I got as far as building the string, but I don't know how to store the strings in a list. Can anyone help me with this?
for i in range(NR):
    print("This TC checks the output for")
    for j in range(NC):
        inputVariable = str(ws[get_column_letter(ColumnStart+j) + str(rowStart-1)].value)
        c = str((ws.cell(row = (rowStart + i), column = (ColumnStart + j)).value))
        if (ws.cell(row = (rowStart + i), column = (ColumnStart+j)).value) == (ws.cell(row = (MaxValRow), column = (ColumnStart+j)).value):
            b = '(maximum)'
        elif (ws.cell(row = (rowStart + i), column = (ColumnStart+j)).value) == (ws.cell(row = (MinValRow), column = (ColumnStart+j)).value):
            b = '(minimum)'
        else:
            b = '(intermediate)'
        Commentstr = str(j+1) + '. The value of input ' + inputVariable + ' =' + " " + c + b
        # need to create a list here to store the Commentstr for each iteration
NR = no. of rows, NC = no. of columns
my_list = []
for i in range(NR):
    print("This TC checks the output for")
    for j in range(NC):
        inputVariable = str(ws[get_column_letter(ColumnStart+j) + str(rowStart-1)].value)
        c = str((ws.cell(row = (rowStart + i), column = (ColumnStart + j)).value))
        if (ws.cell(row = (rowStart + i), column = (ColumnStart+j)).value) == (ws.cell(row = (MaxValRow), column = (ColumnStart+j)).value):
            b = '(maximum)'
        elif (ws.cell(row = (rowStart + i), column = (ColumnStart+j)).value) == (ws.cell(row = (MinValRow), column = (ColumnStart+j)).value):
            b = '(minimum)'
        else:
            b = '(intermediate)'
        Commentstr = str(j+1) + '. The value of input ' + inputVariable + ' =' + " " + c + b
        my_list.append(Commentstr)
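The key change is append, which grows the list; assigning to an index that does not exist yet, as in my_list[x] = Commentstr on an empty list, raises an IndexError. A tiny self-contained sketch of the pattern, with stand-in loop bounds and values, also shows how to keep one sub-list per row if you prefer that layout:
flat = []      # every comment string, in order
per_row = []   # one sub-list of comment strings per row
for i in range(2):            # stand-in for range(NR)
    row_items = []
    for j in range(3):        # stand-in for range(NC)
        comment = f'{j + 1}. The value of input col{j} = {i * j}(intermediate)'
        flat.append(comment)
        row_items.append(comment)
    per_row.append(row_items)
print(len(flat), len(per_row), len(per_row[0]))   # 6 2 3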
I'm scraping data for a data science project using Selenium, and I don't know why I get Index errors on the write-to-csv portion. When I print out the data as-is, the output looks normal.
Code below:
driver = webdriver.Firefox(executable_path="/filepath/geckodriver.exe")
url = 'https://website.com'
driver.get(url)

with open('file.csv', 'w') as f:
    f.write('Column1', 'Column2', 'Column3', '\n')

ids = driver.find_elements_by_xpath('//*[@class="id-name"]')
id_list = []
for i in range(50):
    id_list.append(ids[i].text)

print(len(ids))
print(len(id_list))
print(id_list[0:50])

# Break up into batches to save memory
new_id_list = [id_list[i:i+5] for i in range(0, len(id_list), 5)]
#time.sleep(1200)

for i in range(len(new_id_list)):
    for j in range(len(new_id_list[i])):
        url = 'http://www.website.com?id=' + str(id_list[j])
        driver.get(url)
        col1 = driver.find_elements_by_xpath('//*[@id="field-value-col_1"]/span/span')
        col2 = driver.find_elements_by_xpath('//h1[@id="field-value-col_2"]')
        col3 = driver.find_elements_by_xpath('//*[@id="field-value-col_3"]')
        print(id_list[i][j] + ',' + col1[0].text + ',' + col2[0].text + ',' + col3[0].text, '\n')
        # This is where I get the error usually.
        with open('bugzilla.csv', 'w') as f:
            f.write(id_list[i][j] + ',' + col1[0].text + ',' + col2[0].text + ',' + col3[0].text, '\n')
    print('Batch of 5')

f.close()
Here:
print(id_list[i][j] + ',' + col1[0].text + ',' + col2[0].text + ',' + col3[0].text, '\n')
you treat id_list as a two-dimensional list, while earlier you defined it as a flat, one-dimensional list:
id_list = []
for i in range(50):
    id_list.append(ids[i].text)
You probably meant: print(new_id_list[i][j] + ',' + col1[0].text + ',' + col2[0].text + ',' + col3[0].text, '\n')
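On the CSV side, also note that the file is reopened in 'w' mode on every pass, which truncates it each time, and f.write() only accepts a single string. Below is a sketch of that part using csv.writer, opened once before the loop; it reuses the variables from the question (driver, new_id_list, the same XPath lookups) and adds an extra ID column as a header choice, so treat it as an outline rather than a drop-in replacement.
import csv

with open('bugzilla.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['ID', 'Column1', 'Column2', 'Column3'])  # header written once
    for i in range(len(new_id_list)):
        for j in range(len(new_id_list[i])):
            driver.get('http://www.website.com?id=' + str(new_id_list[i][j]))
            col1 = driver.find_elements_by_xpath('//*[@id="field-value-col_1"]/span/span')
            col2 = driver.find_elements_by_xpath('//h1[@id="field-value-col_2"]')
            col3 = driver.find_elements_by_xpath('//*[@id="field-value-col_3"]')
            # one row per record; csv.writer handles separators and quoting
            writer.writerow([new_id_list[i][j], col1[0].text, col2[0].text, col3[0].text])
        print('Batch of 5')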
I'm trying to use this dictionary:
student_data_dict = {'Student_1': 'bbbeaddacddcddaaadbaabdad', 'Student_2': 'acbccaddcadaaacdadbcabcad', 'Student_3': 'babcabdccadcDdbccdbaadbad', 'Student_4': 'bcbcabddcadcdabccdbaadcbd', 'Student_5': 'DCBCCADDCADBDACCDBBACBCAD', 'Student_6': 'acbeccddcadbaaccabbacdcad', 'Student_7': 'BCBCBCDABADCADCCDABAACCAD', 'Student_8': 'dcbccbddcadaabcbcacabbcad', 'Student_9': 'DDBDBBCDDCCBABCCBACADAAAC', 'Student_10': 'cbbdacdacadcbadbabaabcaTa', 'Student_11': 'BDBECADCAADCAAAAACBACACAD', 'Student_12': 'DBBCCBDCCADCDABABCBAABCAD', 'Student_13': 'BCBCBCDDCADCAAACCABACACAD', 'Student_14': 'DBBECBDACADAAACBCBAAABCBD', 'Student_15': 'acbebbddcadbaacccbcaddcad', 'Student_16': 'ACBEBCDDCADBAACCAACADBCAD', 'Student_17': 'DBBCACDDCADCAABCADBABDDAD', 'Student_18': 'dcbcdcdbbddccabbdacacccbd', 'Student_19': 'dbbccbddcadaaaccbdcaaacad', 'Student_20': 'abbdaaddcadcaaccbdcaaccbd', 'Student_21': 'DCDCABDBCADAAACDCCDAACAAD', 'Student_22': 'dabdaddabddbaacdacbaaaaad', 'Student_23': 'BCBCDDDACCDCAABDDABACACAD', 'Student_24': 'ACBDCBDBBCDAACCCCBDAADCBD', 'Student_25': 'DCBCACDAADDCADCBAABACBCAD', 'Student_26': 'dcbaabdccadcdadcccbaabdbd', 'Student_27': 'abbadbddcadacbcacccacbdad'}
and store the first letter from every student's string as one dictionary entry, then do the same for the second letter, etc., to end up with:
{'question_1': 'babbDaBdDcBDBDaADddaDdBADda', 'question_2': 'bcacCcCcDbDBCBcCBcbbCaCCCcb', 'question_3': 'bbbbBbBbBbBBBBbBBbbbDbBBBbb', 'question_4': 'ecccCeCcDdECCEeECccdCdCDCaa', 'question_5': 'acaaCcBcBaCCBCbBAdcaAaDCAad', 'question_6': 'dabbAcCbBcABCBbCCcbaBdDBCbb', 'question_7': 'ddddDdDdCdDDDDdDDdddDdDDDdd', 'question_8': 'adcdDdAdDaCCDAdDDbddBaABAcd', 'question_9': 'ccccCcBcDcACCCcCCbccCbCBAcc', 'question_10': 'daaaAaAaCaAAAAaAAdaaAdCCDaa', 'question_11': 'ddddDdDdCdDDDDdDDdddDdDDDdd', 'question_12': 'caccBbCaBcCCCAbBCcacAbCACca', 'question_13': 'daDdDaAaAbADAAaAAcaaAaAAAdc', 'question_14': 'dadaAaDbBaAAAAaAAaaaAaACDab', 'question_15': 'acbbCcCcCdABACcCBbccCcBCCdc', 'question_16': 'adccCcCbCbAACBcCCbccDdDCBca', 'question_17': 'aaccDaDcBaABCCcAAdbbCaDCAcc', 'question_18': 'ddddBbAaAbCCABbADaddCcABAcc', 'question_19': 'bbbbBbBcCaBBBAcCBcccDbBDBbc', 'question_20': 'acaaAaAaAaAAAAaAAaaaAaAAAaa', 'question_21': 'aaaaCcAbDbCACAdDBcaaAaCACac', 'question_22': 'bbddBdCbAcABABdBDcacCaADBbb', 'question_23': 'dcbcCcCcAaCCCCcCDcccAaCCCdd', 'question_24': 'aaabAaAaATAAABaAAbabAaABAba', 'question_25': 'ddddDdDdCaDDDDdDDdddDdDDDdd'}
x = 1
all_letters = ''
letter = ''
y = 1
i = 0
z = 0
for start in student_data_dict:
    student = student_data_dict.get('Student_' + str(y))
    letter = student[z]
    all_letters = all_letters + letter
    y = y + 1
    i = i + 1
    question_data_dict["question " + str(x)] = all_letters
    if i == 27:
        z = z + 1
        x = x + 1
        i = 0
print(question_data_dict)
data_file.close()
{'question 1': 'babbDaBdDcBDBDaADddaDdBADda'}
is what I get, but I can't get the answers for the remaining questions.
I tried changing for start in student_data_dict: into while z < 26:, but then at the line letter = student[z] I get the error "'NoneType' object is not subscriptable".
num_questions = 25
answers_dict = {}
for i in range(num_questions):
    answers_dict['question' + str(i)] = ''.join(c[i] for c in student_data_dict.values())
print(answers_dict)
Will give you the result you want.
Edit
Fixed code. Extracted the number of questions into a variable so it can be used as an index.
Edit2
I created an OrderedDict from your original dictionary to maintain answer order when iterating. Now the answers_dict contains valid data.
from collections import OrderedDict

ordered_data = OrderedDict()
for i in range(len(student_data_dict.items())):
    ordered_data['Student_' + str(i + 1)] = student_data_dict.get('Student_' + str(i + 1))

num_questions = 25
answers_dict = {}
for i in range(num_questions):
    answers_dict['question' + str(i + 1)] = ''.join(c[i] for c in ordered_data.values())
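Side note: in Python 3.7+ plain dictionaries preserve insertion order, so if student_data_dict was created in Student_1 through Student_27 order the OrderedDict step is not strictly required. You can also sort the keys numerically instead of rebuilding the dict; a sketch using the same student_data_dict:
# Sort the Student_N keys by their number, then join the i-th letters.
ordered_values = [
    student_data_dict[k]
    for k in sorted(student_data_dict, key=lambda k: int(k.split('_')[1]))
]
answers_dict = {
    'question' + str(i + 1): ''.join(v[i] for v in ordered_values)
    for i in range(25)
}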
You need to reset y when you move to the next question.
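Applied to the loop from the question, that fix looks roughly like the sketch below; it assumes the same student_data_dict, and note that all_letters also has to be reset for each question.
question_data_dict = {}
for z in range(25):                       # one pass per question
    all_letters = ''
    y = 1                                 # reset the student counter each time
    for _ in range(len(student_data_dict)):
        student = student_data_dict.get('Student_' + str(y))
        all_letters += student[z]
        y += 1
    question_data_dict['question ' + str(z + 1)] = all_letters
print(question_data_dict)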
Here's an alternative way to get what you're looking for with Pandas:
import pandas as pd

# split each answer string into a list of single characters
sdd = {k: [x for x in v] for k, v in student_data_dict.items()}
df = pd.DataFrame(sdd)
# sort the Student_N columns numerically
df = df.reindex(sorted(df.columns,
                       key=lambda col: int(col.split("_")[-1])), axis=1)
df.index = [f"Question {i+1}" for i in df.index]
{k: ''.join(v) for k, v in zip(df.index, df.values)}