Pandas: retrieve data in a specific order fast - Python

I would like to get a number of entries at once, in a specific order given by the values of an ID column. To make things more complicated, as input I have rows with ID1 and ID2, and for each row either ID1 or ID2 is in the table, but not both.
The IDs are all unique.
import pandas as pd
import numpy as np
print('Generating table and matchTable...')
N = 10000
# General unique IDs list to draw from
ids = np.random.choice(a=list(range(N*100)), replace=False, size=N*10)
# First N ids go into MAIN_IDS
mainIDs = ids[:N]
data = np.random.randint(low=0, high=25, size=N)
table = pd.DataFrame({'MAIN_IDS': mainIDs, 'DATA': data})
# These ids exist in the table as MAIN_IDS
tableIdsList = np.random.choice(mainIDs, replace=False, size=int(N/10))
notInTableIdsList = ids[N:N+int(N/10)]
idsA = np.zeros(shape=(int(N/10)), dtype=int)  # np.int is deprecated, use the builtin int
idsB = np.zeros(shape=(int(N/10)), dtype=int)
for i in range(len(idsA)):
    if np.random.random() > 0.4:
        idsA[i] = tableIdsList[i]
        idsB[i] = notInTableIdsList[i]
    else:
        idsA[i] = notInTableIdsList[i]
        idsB[i] = tableIdsList[i]
matchTable = pd.DataFrame({'ID1': idsA, 'ID2': idsB})
print(' Done!')
print('Generating the correct result...')
correctResult = []
for i in range(len(tableIdsList)):
    correctResult.append(data[np.where(mainIDs == tableIdsList[i])[0][0]])
correctResult = np.array(correctResult)
print(' Done!')
I want to get DATA where MAIN_IDS == ID1 or ID2, but in the order of the matchTable.

First filter your matchTable down to the IDs that exist in table, then use reindex:
idx = matchTable.where(matchTable.isin(table.MAIN_IDS.tolist())).stack()
table = table.set_index('MAIN_IDS').reindex(idx).reset_index()
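A merge-based variant gives the same result if you prefer to avoid reindex. A minimal sketch, assuming the table and matchTable built above (a left merge preserves the row order of the left frame, i.e. the order of matchTable):
# Pick whichever of ID1/ID2 actually exists in table, keeping matchTable's row order
lookup = matchTable['ID1'].where(matchTable['ID1'].isin(table['MAIN_IDS']), matchTable['ID2'])
result = pd.DataFrame({'MAIN_IDS': lookup}).merge(table, on='MAIN_IDS', how='left')
# result['DATA'].to_numpy() should line up with correctResult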

Related

How do I find the uniques and the count of rows for multiple columns?

I have a file with 136 columns. I was trying to find the unique values of each column, and from there I need to find the number of rows for those unique values.
I tried using a df and a dict for the unique values. However, when I export it back to a csv file, the unique values are exported as a list in one cell for each column.
Is there any way to simplify the counting of the unique values in each column?
df = pd.read_excel(filename)
column_headers = list(df.columns.values)
df_unique = {}
df_count = {}
def approach_1(data):
    count = 0
    for entry in data:
        if not entry == 'nan' or not entry == 'NaN':
            count += 1
    return count
for unique in column_headers:
    new = df.drop_duplicates(subset=unique, keep='first')
    df_unique[unique] = new[unique].tolist()
csv_unique = pd.DataFrame(df_unique.items(), columns=['Data Source Field', 'First Row'])
csv_unique.to_csv('Unique.csv', index=False)
for count in df_unique:
    not_nan = approach_1(df_unique[count])
    df_count[count] = not_nan
csv_count = pd.DataFrame(df_count.items(), columns=['Data Source Field', 'Count'])
.unique() is simpler: len(df[col].unique()) is the count.
import pandas as pd
records = [
    {"col1": "0", "col2": "a"},
    {"col1": "1", "col2": "a"},
    {"col1": "2", "col2": "a"},
    {"col1": "3", "col2": "a"},
    {"col1": "4", "col2": "a"},
    {"col2": "a"}
]
df = pd.DataFrame.from_dict(records)
result_dict = {}
for col in df.columns:
    result_dict[col] = len(df[col].dropna().unique())
print(result_dict)
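If only the counts are needed, pandas also provides DataFrame.nunique(), which excludes NaN by default; a one-line sketch using the same df as above:
# Distinct (non-NaN) value counts, one entry per column
print(df.nunique().to_dict())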

Looking for a more elegant and sophisticated solution when multiple ifs and for-loops are used

I am a beginner/intermediate Python user, and when I write elaborate code (at least for me), I always try to rewrite it to reduce the number of lines where possible.
Here is the code I have written.
It basically reads all values of one data frame looking for a specific string; if the string is found, it saves the index and value in a dictionary and drops the rows where that string was found. Then it does the same with the next string...
##### Reading CSV file values and looking for variant IDs ######
# Find Variant ID (rs000000) in CSV
# \d+ is necessary in case the line finds rs + something; r"rs\d+" looks for rs + numbers
rs = df_draft[df_draft.apply(lambda x: x.str.contains(r"rs\d+"))].dropna(how='all').dropna(axis=1, how='all')
# Now, we save the results found in a dict: key=index and value=variant ID
if rs.empty == False:
    ind = rs.index.to_list()
    vals = list(rs.stack().values)
    row2rs = dict(zip(ind, vals))
    print(row2rs)
    # We need to remove the rows where rs has been found.
    # Because if more than one ID variant is found in the same row (i.e. rs# and NM_#),
    # this code is going to get the same variant more than once.
    for index, rs in row2rs.items():
        # Rows where substring 'rs' has been found need to be deleted to avoid repetition
        # This will be done in df_draft
        df_draft = df_draft.drop(index)
## Same thing with other ID variants
# Here with Variant ID (NM_0000000) in CSV
NM = df_draft[df_draft.apply(lambda x: x.str.contains(r"NM_\d+"))].dropna(how='all').dropna(axis=1, how='all')
if NM.empty == False:
    ind = NM.index.to_list()
    vals = list(NM.stack().values)
    row2NM = dict(zip(ind, vals))
    print(row2NM)
    for index, NM in row2NM.items():
        df_draft = df_draft.drop(index)
# Here with Variant ID (NP_0000000) in CSV
NP = df_draft[df_draft.apply(lambda x: x.str.contains(r"NP_\d+"))].dropna(how='all').dropna(axis=1, how='all')
if NP.empty == False:
    ind = NP.index.to_list()
    vals = list(NP.stack().values)
    row2NP = dict(zip(ind, vals))
    print(row2NP)
    for index, NP in row2NP.items():
        df_draft = df_draft.drop(index)
# Here with ClinVar field (RCV#) in CSV
RCV = df_draft[df_draft.apply(lambda x: x.str.contains(r"RCV\d+"))].dropna(how='all').dropna(axis=1, how='all')
if RCV.empty == False:
    ind = RCV.index.to_list()
    vals = list(RCV.stack().values)
    row2RCV = dict(zip(ind, vals))
    print(row2RCV)
    for index, RCV in row2RCV.items():
        df_draft = df_draft.drop(index)
I was wondering about a more elegant way of writing this simple but long code.
I have been thinking of sa
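Since the four blocks differ only in the regex and in the name the matches are stored under, one way to shorten the code is a single loop over the patterns. A rough sketch, assuming df_draft holds strings as in the code above (the patterns dictionary is mine, introduced for illustration):
patterns = {'rs': r'rs\d+', 'NM': r'NM_\d+', 'NP': r'NP_\d+', 'RCV': r'RCV\d+'}
matches = {}  # e.g. matches['rs'] plays the role of row2rs
for name, pattern in patterns.items():
    hits = df_draft[df_draft.apply(lambda col: col.str.contains(pattern))]
    hits = hits.dropna(how='all').dropna(axis=1, how='all')
    if not hits.empty:
        matches[name] = dict(zip(hits.index.to_list(), hits.stack().values))
        # drop the matched rows so the next pattern cannot pick them up again
        df_draft = df_draft.drop(hits.index)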

How to loop over multiple subsets, perform operations and bring the results back to the original dataframe in Python?

I have a dataframe with millions of rows and about 100k unique ID numbers. I want to perform operations per unique ID. For now I generate a subset per unique ID and perform some operations on it. This loop works, but how do I efficiently combine the subsets into one dataframe?
Maybe there is a more efficient way to perform operations per subset of unique IDs.
Thanks
for ID in np.unique(df_fin['ID']):
    ID_subset = df_fin.loc[df_fin['ID'] == ID]
    for i in ID_subset.index:
        if ID_subset['date_diff'][i] > 0:
            for p in range(0, ID_subset['date_diff'][i]):
                if p == WIP:
                    sl.appendleft(ID_subset.return_bin[i-1])
                else:
                    sl.appendleft(0)
            lissa = list(sl)
            ID_subset.at[i, 'list_stock'] = lissa
    frames = [ID_subset]  # this does not work
final_mod = pd.concat(frames)  # this also does not work
THIS IS WORKING:
I also tried with groupby.apply. See the code below.
from collections import deque  # deque is used below
def create_stocklist(x):
    # DOS and WIP are constants defined elsewhere in the original script
    x['date_diff'] = x['dates'] - x['dates'].shift()
    x['date_diff'] = x['date_diff'].fillna(0)
    x['date_diff'] = (x['date_diff'] / np.timedelta64(1, 'D')).astype(int)
    x['list_stock'] = x['list_stock'].astype(object)
    x['stock_new'] = x['stock_new'].astype(object)
    var_stock = DOS * [0]
    sl = deque([0], maxlen=DOS)
    for i in x.index:
        if x['date_diff'][i] > 0:
            for p in range(0, x['date_diff'][i]):
                if p == WIP:
                    sl.appendleft(x.return_bin[i-1])
                else:
                    sl.appendleft(0)
            lissa = list(sl)
            x.at[i, 'list_stock'] = lissa
    return x
df_fin.groupby(by=['ID']).apply(create_stocklist)
An approach could be:
for _id, g in df_fin.groupby('ID'):
    # do stuff with g
g is a DataFrame containing all rows such that df_fin['ID'] == _id.
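To address the "how do I combine the subsets" part: collect the processed groups in a list and concatenate once at the end. A minimal sketch reusing create_stocklist from the question:
pieces = []
for _id, g in df_fin.groupby('ID'):
    pieces.append(create_stocklist(g.copy()))  # copy() so the original frame is not mutated
final_mod = pd.concat(pieces)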

Iterating over multiple pandas dataframes is slow

I'm trying to find, for every row in DataFrame 1, the number of words it shares with each row in DataFrame 2.
Based on the similarities I want to create a new data frame where the columns are the N rows of DataFrame 2
and the values are the similarity counts.
My current code is working, but it runs very slowly. I'm not sure how to optimize it...
df = pd.DataFrame([])
for x in range(10000):
    save = {}
    terms_1 = data['text_tokenized'].iloc[x]
    save['code'] = data['code'].iloc[x]
    for y in range(3000):
        terms_2 = data2['terms'].iloc[y]
        similar_n = len(list(terms_2.intersection(terms_1)))
        save[data2['code'].iloc[y]] = similar_n
    df = df.append(pd.DataFrame([save]))
Update: new code (still running slowly)
def get_sim(x, terms):
    similar_n = len(list(x.intersection(terms)))
    return similar_n
for index in icd10_terms.itertuples():
    code, terms = index[1], index[2]
    data[code] = data['text_tokenized'].apply(get_sim, args=(terms,))
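One likely bottleneck in the first version is DataFrame.append inside the loop: it copies the growing frame on every iteration (and was removed in pandas 2.0). A rough sketch that collects plain dictionaries and builds the frame once, assuming data and data2 are shaped as in the question:
rows = []
for x in range(len(data)):
    terms_1 = data['text_tokenized'].iloc[x]
    row = {'code': data['code'].iloc[x]}
    for y in range(len(data2)):
        row[data2['code'].iloc[y]] = len(data2['terms'].iloc[y].intersection(terms_1))
    rows.append(row)
df = pd.DataFrame(rows)  # one construction instead of thousands of appends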

Nested for loops for Large Datasets using Pandas

I am working on a data analysis and I have to generate histograms. My code has more than 7 nested for-loops. Each nested loop filters the data frame by a unique value from the category to form a new data frame of subcategories, which is then split further in the same way. Each day has around 400,000 records, and I have to process the last 30 days of records. The goal is to produce histograms for the values (only one numerical column) of the last un-splittable category. How do I reduce the complexity? Any alternate methods?
for customer in data_frame['MasterCustomerID'].unique():
    df_customer = data_frame.loc[data_frame['MasterCustomerID'] == customer]
    for service in df_customer['Service'].unique():
        df_service = df_customer.loc[df_customer['Service'] == service]
        for source in df_service['Source'].unique():
            df_source = df_service.loc[df_service['Source'] == source]
            for subcomponent in df_source['SubComponentType'].unique():
                df_subcomponenttypes = df_source.loc[df_source['SubComponentType'] == subcomponent]
                for kpi in df_subcomponenttypes['KPI'].unique():
                    df_kpi = df_subcomponenttypes.loc[df_subcomponenttypes['KPI'] == kpi]
                    for device in df_kpi['Device_Type'].unique():
                        df_device_type = df_kpi.loc[df_kpi['Device_Type'] == device]
                        for access in df_device_type['Access_type'].unique():
                            df_access_type = df_device_type.loc[df_device_type['Access_type'] == access]
                            df_access_type['Day'] = ifweekday(df_access_type['PerformanceTimeStamp'])
You can use pandas groupby to find the unique combinations of the different column levels and then loop over the dataframe grouped by each combination. There are ~4000 combinations, so be careful when uncommenting the histogram code below.
import string
import numpy as np, pandas as pd
from matplotlib import pyplot as plt
np.random.seed(100)
# Generate 400,000 records (400 obs for 1000 individuals in 6 columns)
NIDS = 1000; NOBS = 400; NCOLS = 6
df = pd.DataFrame(np.random.randint(0, 4, size = (NIDS*NOBS, NCOLS)))
mapper = dict(zip(range(26), list(string.ascii_lowercase)))
df.replace(mapper, inplace = True)
cols = ['Service', 'Source', 'SubComponentType',
        'KPI', 'Device_Type', 'Access_type']
df.columns = cols
# Generate IDs for individuals
df['MasterCustomerID'] = np.repeat(range(NIDS), NOBS)
# Generate values of interest (to be plotted)
df['value2plot'] = np.random.rand(NIDS*NOBS)
# View the counts for each unique combination of column levels
df.groupby(cols).size()
# Do something with the different subsets (such as make histograms)
for levels, group in df.groupby(cols):
    print(levels)
    # fig, ax = plt.subplots()
    # ax.hist(group['value2plot'])
    # ax.set_title(", ".join(levels))
    # plt.savefig("hist_" + "_".join(levels) + ".png")
    # plt.close()
