Python Pandas Create Excel from Loop

Hello, I'm trying to create a program that filters my data from more than 20 Excel files, and I want to create a new Excel file to export the values from my loop. My code is a little complicated; I'm new to Python.
for index, row in data.iterrows():
    def tarih_aralik(Depo):
        try:
            deneme5 = data.loc[(data['Tarih'] >= data_time) & (data['Tarih'] <= data_time2) & (data['Depo'] == Depo)]
            right = deneme5.groupby(['Tarih', deneme5['Adet'] > 0])['Adet'].sum().unstack()
            right = right.rename(columns={True: 'Positive', False: 'Negative'})
            deneme5 = deneme5.join(right, on=None, how='right')
            deneme5 = int(deneme5['Positive'].sum())
            print(deneme5)
        except Exception:
            deneme5 = 0
            print("0")
        return deneme5
    k101 = data.loc[data['Depo'] == 'K101', 'Adet'].sum()
    k104 = data.loc[data['Depo'] == 'K104', 'Adet'].sum()
    print("-->", row.T_kod, "-", row.Açıklama, "<--", "\n",
          k101, "adt K101 toplam", ",", k104, "adt K104 toplam", "\n",
          data_time, "--", data_time2)

Related

Performance problem when using pandas apply on big dataframes

I'm having some performance issues with the code below, mostly because of the apply function that I'm using on a huge dataframe. I want to update the semi_dict dictionary with some other data that I'm calculating with some functions. Is there any way to improve this?
from functools import partial

def my_function_1(semi_dict, row):
    # do some calculation/other stuff based on the row data and append it to the dictionary
    random_dict = dict(data=some_data, more_data=more_data)
    semi_dict["data"].append(random_dict)

def my_function_2(semi_dict, row):
    # do some calculation/other stuff based on the row data and append it to the dictionary
    random_dict = dict(data=some_data, more_data=more_data)
    semi_dict["data2"].append(random_dict)

dictionary_list = []
for v in values:
    df_1_rows = df_1_rows[(df_1_rows.values == v)]
    df_2_rows = df_2_rows[(df_2_rows.values == v)]
    semi_dict = dict(value=v, data=[], data2=[])
    function = partial(my_function_1, semi_dict)
    function_2 = partial(my_function_2, semi_dict)
    df_1_rows.apply(lambda row: function(row), axis=1)
    df_2_rows.apply(lambda row: function_2(row), axis=1)
    dictionary_list.append(semi_dict)
This answer uses dictionary merge from How to merge dictionaries of dictionaries?, but depending on your use case, you might not need it in the end:
import pandas as pd
import random

len_df = 10
row_values = list("ABCD")
extra_col_values = list("12345")

df_1 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col1', 'extra1'])
df_2 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col2', 'extra2'])

def make_dict(df):
    # some calculations on the df
    return {
        'data': df.head(1).values.tolist(),
    }

def make_dict_2(df):
    # some calculations on the df
    return {
        'data_2': df.head(1).values.tolist(),
    }

def merge(a, b, path=None):
    "merges b into a, taken from https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries"
    if path is None:
        path = []
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                merge(a[key], b[key], path + [str(key)])
            elif a[key] == b[key]:
                pass  # same leaf value
            else:
                raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
        else:
            a[key] = b[key]
    return a

dict1 = df_1.groupby('col1').apply(make_dict).to_dict()
dict2 = df_2.groupby('col2').apply(make_dict_2).to_dict()
result = merge(dict1, dict2)
result
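As a footnote on the question's code: DataFrame.apply(axis=1) calls the Python function once per row and materialises a Series for every row, so it is no faster than an explicit loop. If the row-wise side effects must stay, itertuples() is usually the cheaper way to drive them. A hedged micro-sketch reusing the question's names; each row arrives as a namedtuple, so attribute access still works:
# Drive the side-effect functions with itertuples() instead of apply(axis=1).
for row in df_1_rows.itertuples(index=False):
    my_function_1(semi_dict, row)
for row in df_2_rows.itertuples(index=False):
    my_function_2(semi_dict, row)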

How do I speed up the for loop when using pandas?

Hastily, I once wrote this code for work. It was assumed it would only need to be run once, but now it has to be run often. Unfortunately, I cannot provide the source data because it is confidential. Which parts of the code should be corrected to improve performance? The current version runs for about 2 hours, over roughly 25K iterations.
dfs = []
for x in tqdm_notebook(range(len(group_furn))):
    good_id = group_furn.iloc[x, 0]
    name_promo = group_furn.iloc[x, 1]
    unique_stores = set(stores.loc[stores.store_type_id != 4]['store_id'].unique()) - \
                    set(promo_list.loc[(promo_list['good_id'] == good_id) &
                                       (promo_list['name_promo_mech'] == name_promo)].store_id.unique())
    bu_stores = list(unique_stores.intersection(set(bu_furn.store_id.unique())))
    main_stores = bu_store.loc[(bu_store.store_id.isin(bu_stores)) & (bu_store.store_type_id != 4)].main_store_id.unique()
    df = promo_list.loc[(promo_list.good_id == good_id) & (promo_list.name_promo_mech == name_promo) &
                        (promo_list.store_id.isin(main_stores))]
    bu = bu_store.loc[bu_store.main_store_id.isin(main_stores)]
    df = pd.merge(df, bu, how='inner', left_on='store_id', right_on='main_store_id')
    dfs.append(df)

    main_stores = bu_store.loc[(bu_store.store_id.isin(bu_stores)) & (bu_store.store_type_id == 4)].main_store_id.unique()
    owners = bu_store.loc[bu_store.main_store_id.isin(main_stores)].main_owner_id.unique()
    df_2 = promo_list.loc[(promo_list.good_id == good_id) & (promo_list.name_promo_mech == name_promo) &
                          (promo_list.owner_id.isin(owners))]
    bu = bu_store.loc[bu_store.main_store_id.isin(main_stores)]
    df_2 = pd.merge(df_2, bu, how='inner', left_on='store_id', right_on='main_store_id')
    dfs.append(df_2)
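No answer is attached here, but two changes typically help in loops like this: hoist everything that does not depend on good_id/name_promo out of the loop, and replace the repeated boolean scans over promo_list with a single groupby, so each iteration becomes a hash lookup. A hedged sketch, assuming the first two columns of group_furn are good_id and name_promo_mech:
# Hoist loop-invariant work out of the loop.
non_type4_stores = set(stores.loc[stores.store_type_id != 4, 'store_id'].unique())
bu_furn_stores = set(bu_furn.store_id.unique())

# Group once; get_group() is a hash lookup instead of a full-table scan.
promo_groups = promo_list.groupby(['good_id', 'name_promo_mech'])

for good_id, name_promo in group_furn.iloc[:, :2].itertuples(index=False):
    try:
        promo_rows = promo_groups.get_group((good_id, name_promo))
    except KeyError:
        promo_rows = promo_list.iloc[0:0]  # empty frame with the same columns
    unique_stores = non_type4_stores - set(promo_rows.store_id.unique())
    bu_stores = list(unique_stores & bu_furn_stores)
    # ...the rest of the loop body proceeds as before, filtering promo_rows
    # instead of re-scanning promo_list.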

'Jump' Operation in Python to Skip Rows in DataFrame

I have an Excel file containing data like in that picture.
"doc_id" refers to the document ID where the text comes from. In our example, we have 4 documents (doc_id from 0 to 3).
I want to get the values of "text" in the first 5 pages of each document OR before its Table of Contents, whichever comes first.
With our example, it should return:
"A0","A1","B1","A3"
(Note that we don't want B0, C0, D0, or C1 because they occur after the Table of Contents of their document, and we don't want A2 or B3 because they have page_id >= 5.)
I don't understand how to create a condition that "breaks" the iteration within each doc_id once we find the Table of Contents or reach page_id == 5, then moves on to the next doc_id.
I tried the following and got stuck.
import pandas as pd

data = pd.read_csv('book2.csv')
test_data = data['text']
doc_id = data['doc_id']
page_id = data['page_id']

def TOC(text):
    return 'content' in text

def new_doc():
    if i == 0:
        return False
    elif doc_id[i] != doc_id[i-1]:
        return True

i = 0
while i < len(test_data):
    stop = 0
    while stop == 0 and not new_doc():
        if TOC(test_data[i]):
            print('toc')
            stop = 1
        else:
            print(doc_id[i], test_data[i])
            i += 1
Appreciate your help. Thanks!
See if this helps
a = df[df.page_id < 5]

def tex(x):
    try:
        if x.any():
            i = x.index[x.str.contains('Table')][0]
    except IndexError:
        i = x.index[-1] + 1
    return i

a[a.index < a.groupby('doc_id')['text'].transform(tex)]['text'].to_list()
Output
['A0', 'A1', 'B1', 'A3']
You have to iterate through the whole document:
import pandas as pd

data = pd.read_csv('book2.csv')[['page_id', 'doc_id', 'text']]

curr_doc_id = -1
before_toc = False

for i, row in data.iterrows():
    if curr_doc_id < row.doc_id:
        curr_doc_id = row.doc_id
        before_toc = True
    if row.text == "Table of Contents":
        before_toc = False
    if before_toc and row.page_id < 5:
        print(row)
*code wasn't tested
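For completeness, the same rule can also be vectorised with a per-document cumulative flag, avoiding explicit iteration entirely. A hedged sketch, assuming the TOC rows literally contain the text 'Table of Contents':
import pandas as pd

df = pd.read_csv('book2.csv')

# True for the first TOC row of each doc_id and every row after it.
seen_toc = (df['text'].str.contains('Table of Contents')
              .astype(int)
              .groupby(df['doc_id'])
              .cummax()
              .astype(bool))

print(df.loc[~seen_toc & (df['page_id'] < 5), 'text'].tolist())
# expected: ['A0', 'A1', 'B1', 'A3']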

TypeError: 'DataFrame' object is not callable python function

I have two functions, one which creates a dataframe from a csv and another which manipulates that dataframe. There is no problem the first time I pass the raw data through the lsc_age(import_data()) functions. However, I get the above-referenced error (TypeError: 'DataFrame' object is not callable) on the second and subsequent attempts. Any ideas for how to solve the problem?
def import_data(csv, date1, date2):
    global data
    data = pd.read_csv(csv, header=1)
    data = data.iloc[:, [0, 1, 4, 6, 7, 8, 9, 11]]
    data = data.dropna(how='all')
    data = data.rename(columns={"National: For Dates 9//1//"+date1+" - 8//31//"+date2: 'event',
                                'Unnamed: 1': 'time', 'Unnamed: 4': 'points',
                                'Unnamed: 6': 'name', 'Unnamed: 7': 'age', 'Unnamed: 8': 'lsc',
                                'Unnamed: 9': 'club', 'Unnamed: 11': 'date'})
    data = data.reset_index().drop('index', axis=1)
    data = data[data.time != 'Time']
    data = data[data.points != 'Power ']
    data = data[data['event'] != "National: For Dates 9//1//"+date1+" - 8//31//"+date2]
    data = data[data['event'] != 'USA Swimming, Inc.']
    data = data.reset_index().drop('index', axis=1)
    for i in range(len(data)):
        if len(str(data['event'][i])) <= 3:
            data['event'][i] = data['event'][i-1]
        else:
            data['event'][i] = data['event'][i]
    data = data.dropna()
    age = []
    event = []
    gender = []
    for row in data.event:
        gender.append(row.split(' ')[0])
        if row[:9] == 'Female 10':
            n = 4
            groups = row.split(' ')
            age.append(' '.join(groups[1:n]))
            event.append(' '.join(groups[n:]))
        elif row[:7] == 'Male 10':
            n = 4
            groups = row.split(' ')
            age.append(' '.join(groups[1:n]))
            event.append(' '.join(groups[n:]))
        else:
            n = 2
            groups = row.split(' ')
            event.append(' '.join(groups[n:]))
            age.append(groups[1])
    data['age_group'] = age
    data['event_simp'] = event
    data['gender'] = gender
    data['year'] = date2
    return data

def lsc_age(data_two):
    global lsc, lsc_age, top, all_performers
    lsc = pd.DataFrame(data_two['event'].groupby(data_two['lsc']).count()).reset_index().sort_values(by='event', ascending=False)
    lsc_age = data_two.groupby(['year', 'age_group', 'lsc'])['event'].count().reset_index().sort_values(by=['age_group', 'event'], ascending=False)
    top = pd.concat([lsc_age[lsc_age.age_group == '10 & under'].head(), lsc_age[lsc_age.age_group == '11-12'].head(),
                     lsc_age[lsc_age.age_group == '13-14'].head(), lsc_age[lsc_age.age_group == '15-16'].head(),
                     lsc_age[lsc_age.age_group == '17-18'].head()], ignore_index=True)
    all_performers = pd.concat([lsc_age[lsc_age.age_group == '10 & under'], lsc_age[lsc_age.age_group == '11-12'],
                                lsc_age[lsc_age.age_group == '13-14'], lsc_age[lsc_age.age_group == '15-16'],
                                lsc_age[lsc_age.age_group == '17-18']], ignore_index=True)
    all_performers = all_performers.rename(columns={'event': 'no. top 100'})
    all_performers['age_year_lsc'] = all_performers.age_group + ' ' + all_performers.year.astype(str) + ' ' + all_performers.lsc
    return all_performers

years = [i for i in range(2008, 2018)]
for i in range(len(years)-1):
    lsc_age(import_data(str(years[i+1]) + "national100.csv",
                        str(years[i]), str(years[i+1])))
During the first call to your function lsc_age(), in the line
lsc_age = data_two.groupby(['year','age_group','lsc'])['event'].count().reset_index().sort_values(by=['age_group','event'],ascending=False)
you are overwriting your function object with a dataframe. This happens because you declared the name global inside the function with
global lsc, lsc_age, top, all_performers
Functions in Python are objects. Please see more information about this here.
To solve your problem, avoid the global declarations; they do not seem to be necessary. Pass your data around through the arguments and return values of your functions instead.
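A minimal sketch of that fix: without the global declaration, the assignment below creates an ordinary local variable and leaves the module-level function lsc_age() untouched (only the colliding name is shown; the rest of the function stays the same).
def lsc_age(data_two):
    # Local name, so it no longer clobbers the function object.
    lsc_age_counts = (data_two.groupby(['year', 'age_group', 'lsc'])['event']
                      .count()
                      .reset_index()
                      .sort_values(by=['age_group', 'event'], ascending=False))
    # ...build `top` and `all_performers` from lsc_age_counts as before...
    return lsc_age_counts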

Python unified diff with line numbers from both "files"

I'm trying to figure out a way to create unified diffs with line numbers, showing only N lines of context. I have been unable to do this with difflib.unified_diff. I need to show line numbers from both files.
The closest I can come is using diff on the command line like so:
/usr/bin/diff \
    --unchanged-line-format=' %.2dn %L' \
    --old-line-format="-%.2dn %L" \
    --new-line-format="+%.2dn %L" \
    file1.py file2.py
BUT I only want to show N lines of context, and /usr/bin/diff doesn't seem to support context with a custom line format (e.g. combining -U2 with --line-format fails with "conflicting output style options").
Below is an example of what I'd like to accomplish (the same output as the above diff, but only showing 1 line of context surrounding changes):
+01: def renamed_function()
-01: def original_function():
02:
+03: """ Neat stuff here """
04:
21:
+22: # Here's a new comment
23:
85: # Output the value of foo()
+86: print "Foo is %s"%(foo())
-86: print foo()
87:
I was able to figure out something very close to what I wanted to do. It's slower than regular diff, though. Here's the entire code, from my project GitGate.
import difflib
import os
import re

def unified_diff(to_file_path, from_file_path, context=1):
    """ Returns a list of differences between two files based
    on some context. This is probably over-complicated. """
    # Matches difflib's hunk headers, e.g. "@@ -10,4 +10,6 @@".
    pat_diff = re.compile(r'@@ (.[0-9]+\,[0-9]+) (.[0-9]+,[0-9]+) @@')

    from_lines = []
    if os.path.exists(from_file_path):
        with open(from_file_path, 'r') as from_fh:
            from_lines = from_fh.readlines()

    to_lines = []
    if os.path.exists(to_file_path):
        with open(to_file_path, 'r') as to_fh:
            to_lines = to_fh.readlines()

    diff_lines = []
    lines = difflib.unified_diff(to_lines, from_lines, n=context)
    for line in lines:
        # Skip the "---"/"+++" file headers.
        if line.startswith('--') or line.startswith('++'):
            continue
        m = pat_diff.match(line)
        if m:
            left = m.group(1)
            right = m.group(2)
            lstart = left.split(',')[0][1:]
            rstart = right.split(',')[0][1:]
            diff_lines.append("@@ %s %s @@\n" % (left, right))
            to_lnum = int(lstart)
            from_lnum = int(rstart)
            continue
        code = line[0]
        lnum = from_lnum
        if code == '-':
            lnum = to_lnum
        diff_lines.append("%s%.4d: %s" % (code, lnum, line[1:]))
        if code == '-':
            to_lnum += 1
        elif code == '+':
            from_lnum += 1
        else:
            to_lnum += 1
            from_lnum += 1
    return diff_lines
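A hypothetical usage example (the file names are placeholders); each returned entry already ends with a newline:
# Diff two files with one line of context, numbered on both sides.
for line in unified_diff('old_version.py', 'new_version.py', context=1):
    print(line, end='')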
