I can write a for loop that appends a row to a dataframe on each iteration, as in the following example:
from random import randint
import numpy as np
import pandas as pd

dataframe = []
for i in range(2):
    value = randint(0, 10)
    for j in range(2):
        mean = np.mean(value)
        dataframe.append(mean)

cols = ['mean']
result = pd.DataFrame(dataframe, columns=cols)
result
This outputs a dataframe that looks like:

   mean
0     8
1     8
2     9
3     9
How could I output a dataframe that looks like this instead?

   mean_1  mean_2  mean_3  mean_4
0       8       8       9       9
I made the dataframe a pandas DataFrame from the beginning, and then there are multiple ways to add a column (see Add Column):
from random import randint
import numpy as np
import pandas as pd

df = pd.DataFrame()
counter = 0
for i in range(2):
    value = randint(0, 10)
    for j in range(2):
        counter += 1
        mean = np.mean(value)
        column_name = "mean_" + str(counter)
        df.loc[1, column_name] = mean
In answer to the comment: I also moved the line where value is set. It depends, of course, on whether you want the same number per column or a new number everywhere:
from random import randint
import numpy as np
import pandas as pd

df = pd.DataFrame()
for i in range(2):
    for j in range(2):
        value = randint(0, 10)
        mean = np.mean(value)
        column_name = "mean_" + str(i + 1)
        df.loc[j + 1, column_name] = mean
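With this variant, value is drawn inside the inner loop, so every cell gets its own number: two rows (from j) and two columns (from i). For example (the values are random, so yours will differ):

   mean_1  mean_2
1     3.0     7.0
2     5.0     2.0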
import pandas as pd
from random import randint
import numpy as np

m = 2
n = 2
dataframe = pd.DataFrame([0], columns=['mean_1'])
for i in range(m):
    value = randint(0, 10)
    for j in range(n):
        mean = np.mean(value)
        dataframe['mean_' + str(n * i + j + 1)] = mean
I tried to keep what you wrote, adding a few details: a counter for the column names, and an assembly step for your final dataframe. This is not the most optimised way, but as I mentioned, I tried to keep your logic.
from random import randint
import numpy as np
import pandas as pd

dataframe = []
count = 1
cols = []
for i in range(2):
    value = randint(0, 10)
    for j in range(2):
        mean = np.mean(value)
        dataframe.append(mean)
        cols.append('mean_' + str(count))
        count = count + 1

df = pd.DataFrame(columns=cols)
a_series = pd.Series(dataframe, index=cols)
d = df.append(a_series, ignore_index=True)
# Note: DataFrame.append was removed in pandas 2.0; there you would use
# d = pd.concat([df, a_series.to_frame().T], ignore_index=True)
Output:
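For example (the values are random, so yours will differ):

   mean_1  mean_2  mean_3  mean_4
0     8.0     8.0     9.0     9.0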
I have a subset dataframe from a much larger dataframe. I need to be able to create a for loop that searches through the dataframe and pulls out the data corresponding to the correct name.
import pandas as pd
import numpy as np
import re

data = {'Name': ['CH_1', 'CH_2', 'CH_3', 'FV_1', 'FV_2', 'FV_3'],
        'Value': [1, 2, 3, 4, 5, 6]
        }
df = pd.DataFrame(data)

FL = [17.7, 60.0]
CH = [20, 81.4]
tol = 8
time1 = FL[0] + tol
time2 = FL[1] + tol
time3 = CH[0] + tol
time4 = CH[1] + tol
FH_mon = df['Values'] * 5
workpercent = [.7, .92, .94]
mhpy = [2087, 2503, 3128.75]

list1 = list()
list2 = list()
for x in df['Name']:
    if x == [(re.search('FV_', s)) for s in df['Name'].values]:
        y = np.select([FH_mon < time1, (FH_mon >= time1) and (FH_mon < time2), FH_mon > time2],
                      [workpercent[0], workpercent[1], workpercent[2]])
        z = np.select([FH_mon < time1, (FH_mon >= time1) and (FH_mon < time2), FH_mon > time2],
                      [mhpy[0], mhpy[1], mhpy[2]])
    if x == [(re.search('CH_', s)) for s in df['Name'].values]:
        y = np.select([FH_mon < time3, (FH_mon >= time3) and (FH_mon < time4)],
                      [workpercent[0], workpercent[1]])
        z = np.select([FH_mon < time3, (FH_mon >= time3) and (FH_mon < time4)],
                      [mhpy[0], mhpy[1]])
    list1.append(y)
    list2.append(z)
I had a simpler version earlier where I was just adding a couple of numbers, and I was getting really helpful answers to how I asked my question, but here is the more complex version. I need to search through the Name column, and any time there is FV in the name, the if branch should run using the data from that row. Same for CH. I have the lists to keep track of each value as the loop runs through the Name column. If there is a simpler way I would really appreciate seeing it, but right now this seems like the cleanest way, yet I am receiving errors or the loop will not function properly.
This should be what you want:
for index, row in df.iterrows():
    if re.search("FV_", row["Name"]):
        df.loc[index, "Value"] += 2
    elif re.search("CH_", row["Name"]):
        df.loc[index, "Value"] += 4
If the "Name" column only has values starting with "FV_" or "CH_", use where:
df["Value"] = df["Value"].add(2).where(df["Name"].str.startswith("FV_"), df["Value"].add(4))
If you might have other values in "Name", use numpy.select:
import numpy as np
df["Value"] = np.select([df["Name"].str.startswith("FV_"), df["Name"].str.startswith("CH_")], [df["Value"].add(2), df["Value"].add(4)])
Output:
>>> df
   Name  Value
0  CH_1      5
1  CH_2      6
2  CH_3      7
3  FV_1      6
4  FV_2      7
5  FV_3      8
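One caveat with the np.select version: rows whose Name matches neither prefix are set to 0, because np.select defaults to 0 when no condition is true. If other prefixes can occur, a sketch of the fix is to pass the original column as the default:

df["Value"] = np.select(
    [df["Name"].str.startswith("FV_"), df["Name"].str.startswith("CH_")],
    [df["Value"].add(2), df["Value"].add(4)],
    default=df["Value"],  # leave non-matching rows unchanged
)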
I have the following function, mp_process(); I would like to add a progress bar, but I am running into a lot of issues. I'm looking for help on how to add tqdm to mp_process.
from gzip import READ
import http.client
import pandas as pd
import xml.etree.cElementTree as ET
import multiprocessing as mp
from tqdm import tqdm

def mp_process(df):
    N_ROWS = 100  # number of rows in each dataframe
    with mp.Pool(10) as pool:  # use 10 processes
        # break up dataframe into smaller dataframes of N_ROWS rows each
        cnt = len(df.index)
        n, remainder = divmod(cnt, N_ROWS)
        results = []
        start_index = 0
        for i in range(n):
            results.append(pool.apply_async(process_frame, args=(df.loc[start_index:start_index + N_ROWS - 1, :],)))
            start_index += N_ROWS
        if remainder:
            results.append(pool.apply_async(process_frame, args=(df.loc[start_index:start_index + remainder - 1, :],)))
        new_dfs = [result.get() for result in results]
    # reassemble final dataframe:
    ret_df = pd.concat(new_dfs, ignore_index=True)
    return ret_df
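To make the chunking arithmetic concrete: with, say, cnt = 250 and N_ROWS = 100, divmod(250, 100) gives n = 2 and remainder = 50, so the loop submits two 100-row chunks and the trailing if submits one final 50-row chunk.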
Here is a version with a tqdm progress bar, updated from an apply_async callback:

def mp_process(df):
    N_ROWS = 2  # number of rows in each dataframe
    total_row = len(df)
    pbar = tqdm(total=total_row)
    with mp.Pool(10) as pool:  # use 10 processes
        # break up dataframe into smaller dataframes of N_ROWS rows each
        cnt = len(df.index)
        n, remainder = divmod(cnt, N_ROWS)
        results = []

        def update_bar(result):
            pbar.update(N_ROWS)  # this is just for the fancy progress bar

        start_index = 0
        for i in range(n):
            results.append(pool.apply_async(process_frame,
                                            args=(df.loc[start_index:start_index + N_ROWS - 1, :],),
                                            callback=update_bar))
            start_index += N_ROWS
        if remainder:
            results.append(pool.apply_async(process_frame,
                                            args=(df.loc[start_index:start_index + remainder - 1, :],),
                                            callback=update_bar))
        new_dfs = [result.get() for result in results]
    # reassemble final dataframe:
    ret_df = pd.concat(new_dfs, ignore_index=True)
    return ret_df
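A minimal sketch of how this could be driven: process_frame here is a stand-in (the original post never shows it), and the __main__ guard matters because multiprocessing re-imports the module on spawn-based platforms:

def process_frame(chunk):
    # stand-in worker: your real process_frame goes here
    chunk = chunk.copy()
    chunk["x"] = chunk["x"] * 2
    return chunk

if __name__ == "__main__":
    big_df = pd.DataFrame({"x": range(20)})
    result = mp_process(big_df)
    print(result)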
I have just started to learn Python and don't have much of a dev background. Here is the code I have written while learning.
I now want to make a function which does exactly what my for loop is doing, but which calculates a different exp (exp, exp1, etc.) based on a different num (num, num1, etc.). How can I do this?
import pandas as pd

index = [0, 1]
s = pd.Series(['a', 'b'], index=index)
t = pd.Series([1, 2], index=index)
t1 = pd.Series([3, 4], index=index)
df = pd.DataFrame(s, columns=["str"])
df["num"] = t
df['num1'] = t1
print(df)

exp = []
for index, row in df.iterrows():
    if row['str'] == 'a':
        row['mul'] = -1 * row['num']
        exp.append(row['mul'])
    else:
        row['mul'] = 1 * row['num']
        exp.append(row['mul'])
df['exp'] = exp
print(df)
This is what I was trying to do, which gives wrong results:
import pandas as pd

index = [0, 1]
s = pd.Series(['a', 'b'], index=index)
t = pd.Series([1, 2], index=index)
t1 = pd.Series([3, 4], index=index)
df = pd.DataFrame(s, columns=["str"])
df["num"] = t
df['num1'] = t1

def f(x):
    exp = []
    for index, row in df.iterrows():
        if row['str'] == 'a':
            row['mul'] = -1 * x
            exp.append(row['mul'])
        else:
            row['mul'] = 1 * x
            exp.append(row['mul'])
    return exp

df['exp'] = df['num'].apply(f)
df['exp1'] = df['num1'].apply(f)
df
Per suggestion below, I would do:
df['exp']=np.where(df.str=='a',df['num']*-1,df['num']*1)
df['exp1']=np.where(df.str=='a',df['num1']*-1,df['num1']*1)
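If you want this wrapped in a reusable function for the different num columns (a sketch; the helper name signed and its defaults are mine, not from the original post):

import numpy as np

def signed(df, num_col, flag_col='str', flag_value='a'):
    # negate num_col where flag_col equals flag_value, keep it otherwise
    return np.where(df[flag_col] == flag_value, -df[num_col], df[num_col])

df['exp'] = signed(df, 'num')
df['exp1'] = signed(df, 'num1')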
I think you are looking for np.where:

df['exp'] = np.where(df.str == 'a', df['num'] * -1, df['num'] * 1)
df

Out[281]:
  str  num  num1  exp
0   a    1     3   -1
1   b    2     4    2
Normal dataframe operation:

df["exp"] = df.apply(lambda x: x["num"] * (-1 if x["str"] == "a" else 1), axis=1)

Mathematical dataframe operation:

df["exp"] = ((df["str"] != 'a') - 0.5) * 2 * df["num"]
I have a database with ~50 million rows. After reading the CSV into the database I only get 21,000 rows. What am I doing wrong? Thanks.
chunksize = 100000
csv_database = create_engine('sqlite:///csv_database.db', pool_pre_ping=True)
i = 0
j = 0
q = 0
for df in pd.read_csv(filename, chunksize=chunksize, iterator=False):
    # df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df.index += j
    i += 1
    df.to_sql('table', csv_database, if_exists='append')
    j = df.index[-1] + 1
    q += 1
    print("q: " + repr(q))

columnx = df.iloc[:, 0]
columny = df.iloc[:, 1]
columnz = df.iloc[:, 2]
columnmass = df.iloc[:, 3]
Out: [21739 rows x 1 columns] etc. etc.

In [19]: len(df)
Out[19]: 21739
'df' doesn't contain the entire CSV file: you set a chunk size of 100000, so on each iteration df only holds one chunk, and 21739 is the number of rows in the last chunk inserted.
If you do a count(1) on your table, I bet you'll get the full ~50 million rows.
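To verify, you could read the row count back from SQLite (a sketch; the table name 'table' comes from the to_sql call above):

import pandas as pd

total = pd.read_sql_query('SELECT COUNT(1) AS n FROM "table"', csv_database)
print(total)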
The following code works for me.
import numpy as np
import pandas as pd
import sqlite3
from sqlalchemy import create_engine

DIR = 'C:/Users/aslams/Desktop/checkpoint/'
FILE = 'SUBSCRIBER1.csv'
file = '{}{}'.format(DIR, FILE)

csv_database = create_engine('sqlite:///csv_database.db')
chunksize = 10000
i = 0
j = 0
for df in pd.read_csv(file, chunksize=chunksize, iterator=True):
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df.index += 3
    df.to_sql('data_use', csv_database, if_exists='append')
    j = df.index[-1] + 1
    print('| index: {}'.format(j))
I have this code:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).toPandas()
Works fine, does what it needs to. Suppose though I only want to display the first n rows, and then call toPandas() to return a pandas dataframe. How do I do it? I can't call take(n) because that doesn't return a dataframe and thus I can't pass it to toPandas().
So to put it another way, how can I take the top n rows from a dataframe and call toPandas() on the resulting dataframe? I can't imagine this is difficult, but I can't figure it out.
I'm using Spark 1.6.0.
You can use the limit(n) function:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.limit(2).withColumn('age2', df.age + 2).toPandas()
Or:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).limit(2).toPandas()
You could get the first rows of the Spark DataFrame with head and then create a pandas DataFrame:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df_pandas = pd.DataFrame(df.head(3), columns=df.columns)
In [4]: df_pandas
Out[4]:
     name  age
0   Alice    1
1     Jim    2
2  Sandra    3
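Note that df.head(3) returns a list of Row objects (tuples), which the pandas DataFrame constructor accepts directly. Under the same assumptions, the limit-based one-liner from the first answer is equivalent:

df_pandas = df.limit(3).toPandas()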
Try this:

def showDf(df, count=None, percent=None, maxColumns=0):
    if df is None:
        return
    import pandas
    from IPython.display import display
    pandas.set_option('display.encoding', 'UTF-8')
    # Pandas dataframe
    dfp = None
    # maxColumns param
    if maxColumns >= 0:
        if maxColumns == 0:
            maxColumns = len(df.columns)
        pandas.set_option('display.max_columns', maxColumns)
    # count param
    if count is None and percent is None:
        count = 10  # default count
    if count is not None:
        count = int(count)
        if count == 0:
            count = df.count()
        pandas.set_option('display.max_rows', count)
        dfp = pandas.DataFrame(df.head(count), columns=df.columns)
        display(dfp)
    # percent param
    elif percent is not None:
        percent = float(percent)
        if 0.0 <= percent <= 1.0:
            import datetime
            now = datetime.datetime.now()
            seed = int(now.strftime("%H%M%S"))  # long() in the original is Python 2 only
            dfs = df.sample(False, percent, seed)
            count = df.count()
            pandas.set_option('display.max_rows', count)
            dfp = dfs.toPandas()
            display(dfp)
Examples of usage:
# Shows the ten first rows of the Spark dataframe
showDf(df)
showDf(df, 10)
showDf(df, count=10)
# Shows a random sample which represents 15% of the Spark dataframe
showDf(df, percent=0.15)