Here is an example dataset, found via a Google search, that is close to the datasets in my environment.
I'm trying to get output like this:
import pandas as pd
import numpy as np
data = {'Product':['Box','Bottles','Pen','Markers','Bottles','Pen','Markers','Bottles','Box','Markers','Markers','Pen'],
'State':['Alaska','California','Texas','North Carolina','California','Texas','Alaska','Texas','North Carolina','Alaska','California','Texas'],
'Sales':[14,24,31,12,13,7,9,31,18,16,18,14]}
df=pd.DataFrame(data, columns=['Product','State','Sales'])
df1=df.sort_values('State')
#df1['Total']=df1.groupby('State').count()
df1['line']=df1.groupby('State').cumcount()+1
print(df1.to_string(index=False))
The commented-out line throws this error:
ValueError: Columns must be same length as key
I also tried size(), but it gives NaN for all rows.
I hope someone can point me in the right direction.
Thanks in advance.
I think this should work for 'Total':
df1['Total'] = df1.groupby('State')['Product'].transform('count')
Try this:
df = pd.DataFrame(data).sort_values("State")
grp = df.groupby("State")
df["Total"] = grp["State"].transform("size")
df["line"] = grp.cumcount() + 1
Please help me solve this: how do I make a new column in the df with the duration result, for every row? Thanks.
import pandas as pd
from datetime import time,datetime
from itertools import repeat
df = pd.read_csv("data.csv")
df['startdate_column'] = pd.to_datetime(df['startdate_column'])
df['enddate_column'] = pd.to_datetime(df['enddate_column'])
start_time=time(8,0,0)
end_time=time(17,0,0)
unit='min'
df['Duration'] = list(map(businessDuration,startdate=df['startdate_column'],enddate=df['enddate_column'],repeat(start_time),repeat(end_time),repeat(weekendlist=[6]),repeat(unit)))
Use:
f = lambda x: businessDuration(startdate=x['startdate_column'],
                               enddate=x['enddate_column'],
                               starttime=start_time,
                               endtime=end_time,
                               weekendlist=[6],
                               unit=unit)

df['Duration'] = df.apply(f, axis=1)
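For completeness, a minimal standalone sketch of what this assumes: businessDuration typically comes from the third-party business_duration package and takes the keyword arguments used above. Treat the import and the expected value as assumptions if your function is defined elsewhere.
# Assumption: businessDuration is provided by the business_duration package
# (pip install business-duration); adjust the import if yours differs.
from business_duration import businessDuration
from datetime import datetime, time

start_time = time(8, 0, 0)
end_time = time(17, 0, 0)

# Quick sanity check on a single pair of timestamps (a Monday):
# 09:00 to 12:00 within 08:00-17:00 business hours should be roughly 180 minutes.
d = businessDuration(startdate=datetime(2021, 1, 4, 9, 0),
                     enddate=datetime(2021, 1, 4, 12, 0),
                     starttime=start_time,
                     endtime=end_time,
                     weekendlist=[6],
                     unit='min')
print(d)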
I have a large dataframe that has certain cells with values like <25-27>. Is there a simple way to convert these into something like 25|26|27?
Source data frame:
import pandas as pd
import numpy as np
f = {'function':['2','<25-27>','200'],'CP':['<31-33>','210','4001']}
filter = pd.DataFrame(data=f)
filter
Required output:
output = {'function':['2','25|26|27','200'],'CP':['31|32|33','210','4001']}
op = pd.DataFrame(data=output)
op
Thanks a lot!
import re

def convert_range(x):
    # Leave non-matching cells untouched; expand "<a-b>" into "a|a+1|...|b"
    m = re.match(r"<([0-9]+)-([0-9]+)>", x)
    if m is None:
        return x
    s1, s2 = m.groups()
    return "|".join(str(s) for s in range(int(s1), int(s2) + 1))

op = filter.applymap(convert_range)
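As a side note, applymap is deprecated in newer pandas (2.1+) in favour of DataFrame.map; a small sketch assuming the filter frame and convert_range function above:
# Element-wise apply: DataFrame.map on pandas >= 2.1, applymap on older versions
try:
    op = filter.map(convert_range)
except AttributeError:
    op = filter.applymap(convert_range)
print(op)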
I was wondering whether somebody could please give me some assistance with the pandas iterrows method.
I'm currently using an iterative function that works, but I was wondering whether using iterrows would make it more efficient and avoid the for loop?
import pandas as pd
import numpy as np

dataframe_1 = pd.read_csv(r"D\data\2018_19.csv")

def append_date_column(df):
    df = df.copy()
    df['date_column'] = np.nan
    date_range = pd.date_range(start='01/01/2001', periods=207, freq='M').values
    for row in range(df.shape[0]):
        date_number = df.loc[row, "income2"]
        if (not pd.isna(date_number)) and date_number < 207:
            date = date_range[int(date_number) - 1]
            df.loc[row, 'date_column'] = date
    df_with_date_column = df
    return df_with_date_column
Thanks!
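iterrows is usually not faster than the loop you already have; the bigger win is to avoid per-row Python work entirely. Below is a vectorized sketch of the same mapping (the function name is mine, and it assumes income2 holds 1-based positions into the same monthly date range, as in the loop above):
import numpy as np
import pandas as pd

def append_date_column_vectorized(df):
    # Same mapping as the loop above, done in one vectorized step
    df = df.copy()
    date_range = pd.date_range(start='01/01/2001', periods=207, freq='M')
    pos = df['income2']
    mask = pos.notna() & (pos < 207)
    df['date_column'] = pd.NaT
    df.loc[mask, 'date_column'] = date_range[pos[mask].astype(int) - 1]
    return df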
I have a Numpy array consisting of a list of lists, representing a two-dimensional array with row labels and column names as shown below:
data = array([['','Col1','Col2'],['Row1',1,2],['Row2',3,4]])
I'd like the resulting DataFrame to have Row1 and Row2 as index values, and Col1, Col2 as header values
I can specify the index as follows:
df = pd.DataFrame(data, index=data[:,0]),
however, I am unsure how best to assign the column headers.
You need to specify data, index and columns for the DataFrame constructor, as in:
>>> pd.DataFrame(data=data[1:,1:], # values
... index=data[1:,0], # 1st column as index
... columns=data[0,1:]) # 1st row as the column names
Edit: as noted in @joris's comment, you may need to change the above to np.int_(data[1:,1:]) to get the correct data type.
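Equivalently, you can cast after construction with astype; a small sketch on the same data array (not part of the original answer):
# Build from the string values, then convert the whole frame to int
df = pd.DataFrame(data=data[1:, 1:],
                  index=data[1:, 0],
                  columns=data[0, 1:]).astype(int)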
Here is an easy-to-understand solution:
import numpy as np
import pandas as pd
# Creating a 2 dimensional numpy array
>>> data = np.array([[5.8, 2.8], [6.0, 2.2]])
>>> data
array([[5.8, 2.8],
       [6. , 2.2]])
# Creating a pandas DataFrame from the numpy array
>>> dataset = pd.DataFrame({'Column1': data[:, 0], 'Column2': data[:, 1]})
>>> print(dataset)
   Column1  Column2
0      5.8      2.8
1      6.0      2.2
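If the array has more columns, spelling out the dict gets tedious; you can also pass the array and the column names directly (a small alternative sketch, not from the original answer):
# Same frame, built in one call from the 2-D array
dataset = pd.DataFrame(data, columns=['Column1', 'Column2'])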
I agree with Joris; it seems like you should be doing this differently, like with numpy record arrays. Modifying "option 2" from this great answer, you could do it like this:
import pandas
import numpy
dtype = [('Col1','int32'), ('Col2','float32'), ('Col3','float32')]
values = numpy.zeros(20, dtype=dtype)
index = ['Row'+str(i) for i in range(1, len(values)+1)]
df = pandas.DataFrame(values, index=index)
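A quick check that the per-column dtypes from the record array carry through (output shown as a comment; exact formatting may vary):
print(df.dtypes)
# Col1      int32
# Col2    float32
# Col3    float32
# dtype: object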
This can be done simply by using the from_records method of the pandas DataFrame:
import numpy as np
import pandas as pd
# Creating a numpy array
x = np.arange(1,10,1).reshape(-1,1)
dataframe = pd.DataFrame.from_records(x)
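from_records also accepts column names directly; a small sketch with a hypothetical column name:
# 'value' is just an illustrative name for the single column
dataframe = pd.DataFrame.from_records(x, columns=['value'])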
>>> import pandas as pd
>>> import numpy as np
>>> data.shape
(480, 193)
>>> type(data)
numpy.ndarray
>>> df = pd.DataFrame(data=data[0:, 0:],
...                   index=[i for i in range(data.shape[0])],
...                   columns=['f' + str(i) for i in range(data.shape[1])])
>>> df.head()
Here is a simple example of creating a pandas DataFrame from numpy arrays.
import numpy as np
import pandas as pd
# create two 1-D arrays
var1 = np.arange(start=1, stop=21, step=1).reshape(-1)
var2 = np.random.rand(20,1).reshape(-1)
print(var1.shape)
print(var2.shape)
dataset = pd.DataFrame()
dataset['col1'] = var1
dataset['col2'] = var2
dataset.head()
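The same frame can also be built in a single constructor call (a minor variation, not from the original answer):
# One call instead of two column assignments
dataset = pd.DataFrame({'col1': var1, 'col2': var2})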
Adding to @behzad.nouri's answer, we can create a helper routine to handle this common scenario:
import pandas as pd
from numpy import array

def csvDf(dat, **kwargs):
    # First row becomes the column names, first column becomes the index
    data = array(dat)
    if data is None or len(data) == 0 or len(data[0]) == 0:
        return None
    else:
        return pd.DataFrame(data[1:, 1:], index=data[1:, 0],
                            columns=data[0, 1:], **kwargs)
Let's try it out:
data = [['','a','b','c'],
        ['row1','row1cola','row1colb','row1colc'],
        ['row2','row2cola','row2colb','row2colc'],
        ['row3','row3cola','row3colb','row3colc']]

In [61]: csvDf(data)
Out[61]:
             a         b         c
row1  row1cola  row1colb  row1colc
row2  row2cola  row2colb  row2colc
row3  row3cola  row3colb  row3colc
I think this is a simple and intuitive method:
data = np.array([[0, 0], [0, 1] , [1, 0] , [1, 1]])
reward = np.array([1,0,1,0])
dataset = pd.DataFrame()
dataset['StateAttributes'] = data.tolist()
dataset['reward'] = reward.tolist()
dataset
returns:
  StateAttributes  reward
0          [0, 0]       1
1          [0, 1]       0
2          [1, 0]       1
3          [1, 1]       0
But there are performance implications detailed here:
How to set the value of a pandas column as list
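If performance matters, a common alternative (sketched below with hypothetical column names s0 and s1) is to keep the attributes as separate numeric columns rather than as Python lists:
# Hypothetical column names; one numeric column per attribute
dataset = pd.DataFrame(data, columns=['s0', 's1'])
dataset['reward'] = reward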
It's not so short, but maybe it can help you.
Creating Array
import numpy as np
import pandas as pd
data = np.array([['col1', 'col2'], [4.8, 2.8], [7.0, 1.2]])
>>> data
array([['col1', 'col2'],
       ['4.8', '2.8'],
       ['7.0', '1.2']], dtype='<U4')
Creating the data frame

df = pd.DataFrame(i for i in data[1:])   # skip the header row
df.columns = data[0]                     # use the header row as column names

>>> df
  col1 col2
0  4.8  2.8
1  7.0  1.2