Convert *.csv file in DB with repeat second header several times - python

I have *.csv file that looks:
# time;P_O2, atm;P_He, atm;Flow O2, l/min;Flow He, l/min;FiO2 Env, %;FiO2 sens2, %;P mask, cm H2O;Tmask, gradC;Tnagr, gradC;V, ml;f, 1/min;Tzad, gradC;FiO2 zad, %;Flags;
# POWERON 01.11.2018 15:02:29
1;39;33;0;2;0;0;255;135;135;0;0;0;25;83886592;0
5;39;33;0;2;0;0;255;135;135;0;0;0;25;83886624;0
26;0;0;0;8;529;0;255;135;135;0;0;0;25;83886592;0
72;0;0;0;8;598;0;248;135;135;0;0;0;25;83886085;0
# POWERON 01.11.2018 15:04:02
1;0;0;0;7;0;0;255;135;135;0;0;0;25;83886592;0
2;0;0;0;7;113;0;255;135;135;0;0;0;25;83886085;0
# POWERON 01.11.2018 15:04:48
1;0;0;0;6;0;0;255;135;135;0;0;0;25;83886592;0
2;0;0;0;6;115;0;255;135;135;0;0;0;25;83886085;0
So, I try convert it to DB and make this one:
import sqlite3
import pandas as pd
conn = sqlite3.connect('mydb.db')
stud_data = pd.read_csv(r'Log/20181101.LOG', sep=';', engine='python')
stud_data.to_sql('interation', conn, if_exists='replace', index=False)
cur = conn.cursor()
for row in cur.execute('SELECT * FROM interation'):
print(row)
conn.close()
but my result is:
('# POWERON 01.11.2018 15:02:29', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
('1', 39.0, 33.0, 0.0, 2.0, 0.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886592.0, 0.0)
('5', 39.0, 33.0, 0.0, 2.0, 0.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886624.0, 0.0)
('26', 0.0, 0.0, 0.0, 8.0, 529.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886592.0, 0.0)
('72', 0.0, 0.0, 0.0, 8.0, 598.0, 0.0, 248.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886085.0, 0.0)
('# POWERON 01.11.2018 15:04:02', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
('1', 0.0, 0.0, 0.0, 7.0, 0.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886592.0, 0.0)
('2', 0.0, 0.0, 0.0, 7.0, 113.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886085.0, 0.0)
('# POWERON 01.11.2018 15:04:48', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
('1', 0.0, 0.0, 0.0, 6.0, 0.0, 0.0, 255.0, 135.0, 135.0, 0.0, 0.0, 0.0, 25.0, 83886592.0, 0.0)
I need value (date) in 'POWERON' insert second columns each other iteration like this:
POWERON; time; P_O2, atm; P_He, atm; ....
01.11.2018 15:02:29 1; 39; 33; ...
01.11.2018 15:02:29 5; 39; 33; ...
How I may do it simple? May be with pd.read_csv (some option?) or I must use some iteration on DB?
It's my first question, sorry for any mistakes design errors.

Use pd.read_csv is not sufficient to extract the datetime in comments. You have to parse the csvfile yourself. Furthermore, the number of column of header and data are unbalanced.
import sqlite3
import io
def load_csvfile(filepath):
with open(filepath) as csvfile:
data = io.StringIO()
line = csvfile.readline()
line = f"POWERON;{line.rsplit(';', 1)[0]}\n"
data.writelines([line])
for line in csvfile.readlines():
if line.startswith('# POWERON'):
dt = line[10:].strip()
else:
line = f"{dt};{line.rsplit(';', 1)[0]}\n"
data.writelines([line])
data.seek(0)
return pd.read_csv(data, sep=';', parse_dates=['POWERON'])
conn = sqlite3.connect('mydb.db')
stud_data = load_csvfile(r'Log/20181101.LOG')
stud_data.to_sql('interation', conn, if_exists='replace', index=False)
cur = conn.cursor()
for row in cur.execute('SELECT * FROM interation'):
print(row)
conn.close()
('2018-01-11 15:02:29', 1, 39, 33, 0, 2, 0, 0, 255, 135, 135, 0, 0, 0, 25, 83886592)
('2018-01-11 15:02:29', 5, 39, 33, 0, 2, 0, 0, 255, 135, 135, 0, 0, 0, 25, 83886624)
('2018-01-11 15:02:29', 26, 0, 0, 0, 8, 529, 0, 255, 135, 135, 0, 0, 0, 25, 83886592)
('2018-01-11 15:02:29', 72, 0, 0, 0, 8, 598, 0, 248, 135, 135, 0, 0, 0, 25, 83886085)
('2018-01-11 15:04:02', 1, 0, 0, 0, 7, 0, 0, 255, 135, 135, 0, 0, 0, 25, 83886592)
('2018-01-11 15:04:02', 2, 0, 0, 0, 7, 113, 0, 255, 135, 135, 0, 0, 0, 25, 83886085)
('2018-01-11 15:04:48', 1, 0, 0, 0, 6, 0, 0, 255, 135, 135, 0, 0, 0, 25, 83886592)
('2018-01-11 15:04:48', 2, 0, 0, 0, 6, 115, 0, 255, 135, 135, 0, 0, 0, 25, 83886085)
Note: you will receive an UserWarning about the spaces in column names.

Related

How would I be able to find the average from a list of lists given two constaints?

So I have a list of lists, where the 7th index of each sublist contains the value I am interested in averaging, however, the numbers must be averaged according to their type. This type to be matched upond can be found at the 11th index of the sublist.
Below is some code I wrote. In this exa
# Open the csv file
opened_file = open('AppleStore.csv')
from csv import reader
read_file = reader(opened_file)
# Store the data as a list or arrays
apps_data = list(read_file)
# idx_num = index number of interest
# list_doc = the list of lists
# row_start = 1
def extract(idx_num,list_doc,row_start=1):
a_list = []
for row in list_doc[row_start:]:
var = row[idx_num]
a_list.append(var)
return a_list
# Use the extract function to get an array
a_list = extract(11, apps_data, 0)
# Find unique elements
a_list_set = set(a_list)
# Create a dictionary with initial values at [0,0]
dic = dict.fromkeys(a_list_set,[0,0])
print(dic)
# Works as intended
#{'Weather': [0, 0], 'Sports': [0, 0], 'Productivity': [0, 0], 'Games': [0, #0], 'News': [0, 0], 'Finance': [0, 0], 'Education': [0, 0], #'Entertainment': [0, 0], 'Health & Fitness': [0, 0], 'Business': [0, 0], #'Social Networking': [0, 0], 'prime_genre': [0, 0], 'Photo & Video': [0, #0], 'Navigation': [0, 0], 'Music': [0, 0], 'Medical': [0, 0], 'Travel': #[0, 0], 'Reference': [0, 0], 'Shopping': [0, 0], 'Utilities': [0, 0], #'Food & Drink': [0, 0], 'Lifestyle': [0, 0], 'Catalogs': [0, 0], 'Book': #[0, 0]}
for row in apps_data[1:]:
price = float(row[4])
genre = row[11]
# Here is the issue:
# I thought that this would allow for the genre instance to be matched to the appropriate key and then I could append my values.
if genre in dic.keys():
dic[genre][0] += 1
dic[genre][1] += (price)
else:
dic[genre][0] = 1
dic[genre][1] = price
print(dic)
## From here I would extract the array contents of the dictionary
for genre in a_list_set:
print(str(genre) + " mean price:" + str(round(dic[genre][1]/dic[genre][0], 2)))
I got this instead.
{'Weather': [7197, 12423.58999999945], 'Sports': [7197, 12423.58999999945], 'Productivity': [7197, 12423.58999999945], 'Games': [7197, 12423.58999999945], 'News': [7197, 12423.58999999945], 'Finance': [7197, 12423.58999999945], 'Education': [7197, 12423.58999999945], 'Entertainment': [7197, 12423.58999999945], 'Health & Fitness': [7197, 12423.58999999945], 'Business': [7197, 12423.58999999945], 'Social Networking': [7197, 12423.58999999945], 'prime_genre': [7197, 12423.58999999945], 'Photo & Video': [7197, 12423.58999999945], 'Navigation': [7197, 12423.58999999945], 'Music': [7197, 12423.58999999945], 'Medical': [7197, 12423.58999999945], 'Travel': [7197, 12423.58999999945], 'Reference': [7197, 12423.58999999945], 'Shopping': [7197, 12423.58999999945], 'Utilities': [7197, 12423.58999999945], 'Food & Drink': [7197, 12423.58999999945], 'Lifestyle': [7197, 12423.58999999945], 'Catalogs': [7197, 12423.58999999945],'Book': [7197, 12423.58999999945]}
We can do this with itertools.groupby; first, we extract the "columns" of concern from our data, constituting the 7th and 11th value of each row, into subset, also sorting by the 11th value.
Then, we use groupby to partition our subset into groups, where each group's members all have the same 2nd element (the original 11th element). We can then use a dict comprehension to get the mean of the 1st element of each group's members.
from itertools import groupby
from operator import itemgetter
from statistics import mean
subset = sorted(((row[6], row[10]) for row in data), key=itemgetter(1))
result = {key: mean(map(itemgetter(0), group)) for key, group in groupby(subset, itemgetter(1))}
print(result)
Some sample data:
[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.926456602181107, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.261928508086729, 0.0, 0.0, 0.0, 'that'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.582427615396794, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.08345371286375847, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6323414510835206, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -7.755177634382969, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -5.948058847184649, 0.0, 0.0, 0.0, 'that'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -5.767820549798114, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.609131600539092, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.2106567350536854, 0.0, 0.0, 0.0, 'that'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -3.1550716372338297, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.6037278107842077, 0.0, 0.0, 0.0, 'that'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -11.819322083983815, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.441817745217389, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4961079817344718, 0.0, 0.0, 0.0, 'other'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.269603775378254, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.42023137240633596, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.855652365179269, 0.0, 0.0, 0.0, 'this'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -8.048026683773955, 0.0, 0.0, 0.0, 'that'],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -4.577046681982131, 0.0, 0.0, 0.0, 'this']]
And the result:
{'other': 0.585667907075492,
'that': -3.530217022955171,
'this': -0.9035005758618025}

Python:Convert list to dict for assign specific data to variable

Code:
a={'day': [{'average_price': 9.3,
'buy_m2m': 9.3,
'buy_price': 9.3,
'buy_quantity': 1,
'buy_value': 9.3,
'close_price': 0,
'exchange': 'NSE',
'instrument_token': 2867969,
'last_price': 9.3,
'm2m': 0.0,
'multiplier': 1,
'net_buy_amount_m2m': 9.3,
'net_sell_amount_m2m': 0,
'overnight_quantity': 0,
'pnl': 0.0,
'product': 'MIS',
'quantity': 1,
'realised': 0,
'sell_m2m': 0,
'sell_price': 0,
'sell_quantity': 0,
'sell_value': 0,
'tradingsymbol': 'SUBEX',
'unrealised': 0.0,
'value': -9.3}],
'net': [{'average_price': 9.3,
'buy_m2m': 9.3,
'buy_price': 9.3,
'buy_quantity': 1,
'buy_value': 9.3,
'close_price': 0,
'exchange': 'NSE',
'instrument_token': 2867969,
'last_price': 9.3,
'm2m': 0.0,
'multiplier': 1,
'net_buy_amount_m2m': 9.3,
'net_sell_amount_m2m': 0,
'overnight_quantity': 0,
'pnl': 0.0,
'product': 'MIS',
'quantity': 1,
'realised': 0,
'sell_m2m': 0,
'sell_price': 0,
'sell_quantity': 0,
'sell_value': 0,
'tradingsymbol': 'SUBEX',
'unrealised': 0.0,
'value': -9.3}]}
b= a['day']
a shows dict type variable in python. I want to assign value of buy_price which is 9.3 to variable ``x and value of instrument_token which is 2867969 to variable y.
Now problem is after using b=a['day'], b variable becomes list in python so I can not use x=b['buy_price'] to get x=9.3.
What about x=b[0]['buy_price'] ?
Now problem is after using b=a['day'], b variable becomes list
It doesn't "become" a list, it's already one: look at your a dict, a['day'] IS a list - containing one single dict.
so I can not use x=b['buy_price']
Obviously not. buy_price is a key in the dict contained by a['day'], so you must first reference that dict using b[0], and then you can get the keys you want, ie b[0]['buy_price'].
Try this:
x = b[0]['buy_price']
y = b[0]['instrument_token']

Getting feature names after one-hot encoding

I have a dataset that I've recently transformed through one-hot encoding and used it trained a lasso logistic regression on it. I'm trying to get a list of the non-zero coefficients. I can get a list of the coefficients through sklearn but I'm not sure how to map them back to the data after one hot encoding.
A small excerpt of the dataset (pre one hot encoding is below)
{'acc_now_delinq': {29601: 0.0,
143234: 0.0,
157345: 0.0,
158754: 0.0,
229042: 0.0},
'application_type': {29601: 0, 143234: 0, 157345: 0, 158754: 0, 229042: 0},
'collections_12_mths_ex_med': {29601: 0.0,
143234: 0.0,
157345: 0.0,
158754: 0.0,
229042: 0.0},
'credit_age': {29601: 118.0,
143234: 157.0,
157345: 213.0,
158754: 269.0,
229042: 240.0},
'delinq_2yrs': {29601: 0.0,
143234: 0.0,
157345: 0.0,
158754: 0.0,
229042: 0.0},
'dti': {29601: 2.0600000000000001,
143234: 23.710000000000001,
157345: 18.960000000000001,
158754: 18.690000000000001,
229042: 22.530000000000001},
'emp_length_num': {29601: 8.0,
143234: 2.0,
157345: 1.0,
158754: 7.0,
229042: 1.0},
'home_ownership': {29601: 4, 143234: 5, 157345: 5, 158754: 1, 229042: 1},
'inq_last_6mths': {29601: 2.0,
143234: 0.0,
157345: 0.0,
158754: 0.0,
229042: 0.0},
'loan_amnt': {29601: 214.0,
143234: 211.0,
157345: 571.0,
158754: 937.0,
229042: 466.0},
'loan_status': {29601: 0, 143234: 1, 157345: 0, 158754: 1, 229042: 1},
'log_annual_inc': {29601: 11.225243392499999,
143234: 10.8022251252,
157345: 11.0020998412,
158754: 11.6952470218,
229042: 11.225243392499999},
'open_acc': {29601: 5.0,
143234: 21.0,
157345: 11.0,
158754: 9.0,
229042: 14.0},
'pub_rec': {29601: 0.0, 143234: 0.0, 157345: 0.0, 158754: 0.0, 229042: 0.0},
'purpose': {29601: 4, 143234: 2, 157345: 2, 158754: 2, 229042: 2},
'revol_bal': {29601: 2266.0,
143234: 12254.0,
157345: 20657.0,
158754: 11367.0,
229042: 39404.0},
'revol_inc_ratio': {29601: 0.030213333333299997,
143234: 0.24941990637100001,
157345: 0.34428333333300004,
158754: 0.094725000000000004,
229042: 0.52538666666699996},
'revol_util': {29601: 44.0,
143234: 89.400000000000006,
157345: 76.900000000000006,
158754: 81.200000000000003,
229042: 95.5},
'tot_coll_amt': {29601: 0.0,
143234: 0.0,
157345: 0.0,
158754: 0.0,
229042: 0.0},
'tot_cur_bal': {29601: 2266.0,
143234: 115947.0,
157345: 80598.0,
158754: 347695.0,
229042: 355741.40000000002},
'total_acc': {29601: 5.0,
143234: 41.0,
157345: 35.0,
158754: 17.0,
229042: 30.0},
'total_rev_hi_lim': {29601: 5100.0,
143234: 13700.0,
157345: 26900.0,
158754: 14000.0,
229042: 80780.0},
'verification_status': {29601: 0, 143234: 2, 157345: 1, 158754: 2, 229042: 1}}
And my one-hot encoding code:
def one_hot(df):
# Categorical columns for use in one-hot encoder
categorical = (df.dtypes.values != np.dtype('float64'))
print categorical
# Get numpy array from data
x = df.values[:, :-1]
y = df.values[:, -1]
# Apply one hot endcoing
encoder = preprocessing.OneHotEncoder(categorical_features=categorical[:-1], sparse=False) # Last value in mask is y
x = encoder.fit_transform(x)
return x, y
Assuming you have your small excerpt of the dataset stored in a variable called temp:
temp = pd.DataFrame(temp)
categorical = (temp.dtypes.values != np.dtype('float64'))
categorical = temp.columns[categorical]
def one_hot(temp, categorical):
# temp is the data frame from which the "categorical" columns need to be One hot encoded
from sklearn.preprocessing import OneHotEncoder
enc_model = OneHotEncoder(sparse=False)
X = enc_model.fit_transform(temp[categorical])
uniq_vals = temp[categorical].apply(lambda x: x.value_counts()).unstack()
uniq_vals = uniq_vals[~uniq_vals.isnull()]
enc_cols = list(uniq_vals.index.map('{0[0]}_{0[1]}'.format)) # https://stackoverflow.com/questions/41987743/merge-two-multiindex-levels-into-one-in-pandas
enc_df = pd.DataFrame(X, columns=enc_cols, index=temp.index, dtype='bool')
return(enc_df)

How to create a list from existing list in python

I am having a list in below format. How can i create another list from the existing one with just selected elements.
[{'UserDiscount': 0.0, 'CostTotalInvEOPAmount': 940.0, 'WeekEndingData': u'2016-10-08', 'WeeksOnHand': 0.0, 'UnitTotalInvEOPQuantity': 250.0, 'WeeksOfSales': 0.0, 'UnitCostAmount': 3.76, 'Week': u'2016 Wk 36', 'CostReceiptAmount': 940.0, 'UnitSalesQuantity': 0.0, 'UnitReceiptQuantity': 250.0, 'Demand': 0.0, 'InventoryBOP': 0.0, 'PEMDiscount': 0.0, 'ElasticLift': 0.0, 'StoreCount': 0, 'PriceStatus': 4, 'UnitOnOrderQuantity': None, 'ReceiptSizeContributions': [{u'sizeId': u'e1656ac7-1cc1-40ce-b485-989bba9d758d', u'contribution': 1.0}], 'CostSalesAmount': 0.0, 'LifeCycleProperties': {u'IsAtRegularPrice': False, u'IsAtMarkdown': False, u'IsFinished': False, u'IsPreSeason': True}, 'MardownDiscount': 0.0, 'RecommendedReceipt': 250.0, 'RecommendedReceiptSizeContributions': [{u'sizeId': u'e1656ac7-1cc1-40ce-b485-989bba9d758d', u'contribution': 1.0}], 'UnitTotalInvBOPQuantity': 0.0, 'CostOnOrderAmount': None, 'InventoryEOP': 250.0, 'CostTotalInvBOPAmount': 0.0, 'Receipt': 250.0, 'Sales': 0.0, 'LostSales': 0.0, 'TotalDiscount': 0.0, 'RetailSalesAmount': 0.0},
{'UserDiscount': 0.0, 'CostTotalInvEOPAmount': 940.0, 'WeekEndingData': u'2016-10-15', 'WeeksOnHand': 0.0, 'UnitTotalInvEOPQuantity': 250.0, 'WeeksOfSales': 15.784951285314385, 'UnitCostAmount': 3.76, 'Week': u'2016 Wk 37', 'CostReceiptAmount': 0.0, 'UnitSalesQuantity': 0.0, 'UnitReceiptQuantity': 0.0, 'Demand': 0.0, 'InventoryBOP': 250.0, 'PEMDiscount': 0.0, 'ElasticLift': 0.0, 'StoreCount': 0, 'PriceStatus': 4, 'UnitOnOrderQuantity': None, 'ReceiptSizeContributions': [], 'CostSalesAmount': 0.0, 'LifeCycleProperties': {u'IsAtRegularPrice': False, u'IsAtMarkdown': False, u'IsFinished': False, u'IsPreSeason': True}, 'MardownDiscount': 0.0, 'RecommendedReceipt': 0.0, 'RecommendedReceiptSizeContributions': [], 'UnitTotalInvBOPQuantity': 250.0, 'CostOnOrderAmount': None, 'InventoryEOP': 250.0, 'CostTotalInvBOPAmount': 940.0, 'Receipt': 0.0, 'Sales': 0.0, 'LostSales': 0.0, 'TotalDiscount': 0.0, 'RetailSalesAmount': 0.0}]
My new list will having below elements.
[{'UserDiscount': 0.0, 'CostTotalInvEOPAmount': 940.0, 'WeekEndingData': u'2016-10-08', 'WeeksOnHand': 0.0, 'UnitTotalInvEOPQuantity': 250.0, 'WeeksOfSales': 0.0, 'UnitCostAmount': 3.76, 'Week': u'2016 Wk 36', 'CostReceiptAmount': 940.0, 'UnitSalesQuantity': 0.0, 'UnitReceiptQuantity': 250.0, 'Demand': 0.0, 'InventoryBOP': 0.0, 'PEMDiscount': 0.0, 'ElasticLift': 0.0, 'StoreCount': 0, 'PriceStatus': 4, 'UnitOnOrderQuantity': None, 'CostSalesAmount': 0.0, 'RecommendedReceipt': 250.0, 'RetailSalesAmount': 0.0},
{'UserDiscount': 0.0, 'CostTotalInvEOPAmount': 940.0, 'WeekEndingData': u'2016-10-15', 'WeeksOnHand': 0.0, 'UnitTotalInvEOPQuantity': 250.0, 'WeeksOfSales': 15.784951285314385, 'UnitCostAmount': 3.76, 'Week': u'2016 Wk 37', 'CostReceiptAmount': 0.0, 'UnitSalesQuantity': 0.0, 'UnitReceiptQuantity': 0.0, 'Demand': 0.0, 'InventoryBOP': 250.0, 'PEMDiscount': 0.0, 'ElasticLift': 0.0, 'StoreCount': 0, 'PriceStatus': 4, 'UnitOnOrderQuantity': None, 'CostSalesAmount': 0.0, 'RecommendedReceipt': 0.0, 'RetailSalesAmount': 0.0}]
You have a list with two dictionaries. To filter the dictionaries you can try
keep=[key1,key2] #keys you wanna keep
newList = []
for item in mylist:
d = dict((key,value) for key, value in item.iteritems() if key in keep)
newlist.append(d)
del mylist
Also using funcy you can do a
import funcy
mydict={1:1,2:2,3:3}
keep=[1,2]
funcy.project(mydict,keep)
=> {1: 1, 2: 2}
which is much prettier imho.
You could use the list comprehension https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions
[l for l in your_list if l['UserDiscount'] >= 1 ]
[{'UserDiscount': l['UserDiscount'],'CostTotalInvEOPAmount': l['CostTotalInvEOPAmount']} for l in your_list ]
Using this way you can filter the elements in your list and change the structure of your dicts in the list

assigning different data types for different columns in a numpy array

I have a numpy array, with (8000000, 7) shape.
I want to keep the first 6 columns of the numpy array as float32 data type, and last column as int8 type.
And at the end, I want to save it as a csv file.
How can I manage this?
You could construct a structured array, but I wonder if you need to, especially if all you want is a csv file. The fmt parameter controls how savetxt writes the columns.
First with the default fmt and column_stack:
In [1484]: a=np.random.rand(5,3)
In [1485]: b=np.arange(5,dtype=np.int8)
In [1486]: np.savetxt('test.txt',np.column_stack((a,b)))
In [1487]: cat test.txt
3.513972543477327237e-01 8.468274950931957701e-01 6.587019305719005180e-01 0.000000000000000000e+00
...
With a simpler float format:
In [1492]: np.savetxt('test.txt',np.column_stack((a,b)),fmt='%f')
In [1493]: cat test.txt
0.351397 0.846827 0.658702 0.000000
0.566257 0.419570 0.183939 1.000000
0.276351 0.341277 0.706639 2.000000
0.515183 0.296801 0.321054 3.000000
0.305349 0.407097 0.328825 4.000000
Or by specifying format for each column:
In [1496]: np.savetxt('test.txt',np.column_stack((a,b)),fmt=['%f']*3+['%d'])
In [1497]: cat test.txt
0.351397 0.846827 0.658702 0
0.566257 0.419570 0.183939 1
0.276351 0.341277 0.706639 2
0.515183 0.296801 0.321054 3
0.305349 0.407097 0.328825 4
==============================
A nice way of constructing a structured array with data like this is to define 2 fields, and make the first an array:
In [1503]: dt=np.dtype('(3)f,i8')
In [1504]: A=np.empty((5,),dtype=dt)
In [1505]: A['f0']=a
In [1506]: A['f1']=b
In [1507]: A
Out[1507]:
array([([0.35139724612236023, 0.846827507019043, 0.6587019562721252], 0),
([0.566256582736969, 0.41956955194473267, 0.18393920361995697], 1),
([0.27635079622268677, 0.3412773013114929, 0.706638514995575], 2),
([0.5151825547218323, 0.29680076241493225, 0.32105395197868347], 3),
([0.30534881353378296, 0.4070965051651001, 0.3288247585296631], 4)],
dtype=[('f0', '<f4', (3,)), ('f1', '<i8')])
Unfortunately savetxt can't handle that kind of 'nested' dtype. The best I can do is format the first field as a string, with []
In [1509]: np.savetxt('test.txt',A,fmt=['%s','%d'])
In [1511]: cat test.txt
[ 0.35139725 0.84682751 0.65870196] 0
[ 0.56625658 0.41956955 0.1839392 ] 1
[ 0.2763508 0.3412773 0.70663851] 2
[ 0.51518255 0.29680076 0.32105395] 3
[ 0.30534881 0.40709651 0.32882476] 4
Instead I need to make a flat dtype; with the same bytes layout I can apply it with a view (or construct the array from scratch)
In [1512]: dt1=np.dtype('f,f,f,i8')
In [1514]: A.view(dt1)
Out[1514]:
array([(0.35139724612236023, 0.846827507019043, 0.6587019562721252, 0),
(0.566256582736969, 0.41956955194473267, 0.18393920361995697, 1),
(0.27635079622268677, 0.3412773013114929, 0.706638514995575, 2),
(0.5151825547218323, 0.29680076241493225, 0.32105395197868347, 3),
(0.30534881353378296, 0.4070965051651001, 0.3288247585296631, 4)],
dtype=[('f0', '<f4'), ('f1', '<f4'), ('f2', '<f4'), ('f3', '<i8')])
Now I can write it with the same fmt as before:
In [1515]: np.savetxt('test.txt',A.view(dt1),fmt=['%f']*3+['%d'])
In [1516]: cat test.txt
0.351397 0.846828 0.658702 0
0.566257 0.419570 0.183939 1
0.276351 0.341277 0.706639 2
0.515183 0.296801 0.321054 3
0.305349 0.407097 0.328825 4
If one or more of your columns was strings then you would need to use structured array. But as long as all the columns are numbers, you can get by with an all-float array, and control the print with the fmt.
I thought it would be relatively easy to break up the array into floats and ints and then use a combination of zip and np.savetxt to put it all back together in the csv. But Support zip input in savetxt in Python 3 suggests that way lies madness.
However, being stuck on the zip idea, I just moved the work to the standard csv module. Since numpy data needs to be converted to python types it may be a bit slower. But we're talking csv writing here so hopefully its just lost in the noise.
First, generate the test array
>>> import numpy as np
>>> array = np.arange(0., 18.*5, 5., dtype=np.float32).reshape((3,6))
>>> array
array([[ 0., 5., 10., 15., 20., 25.],
[ 30., 35., 40., 45., 50., 55.],
[ 60., 65., 70., 75., 80., 85.]], dtype=float32)
Split out the final column and recast as uint8
>>> floats, ints, _after = np.hsplit(array, (5,6))
>>> ints=ints.astype(np.uint8)
>>> floats
array([[ 0., 5., 10., 15., 20.],
[ 30., 35., 40., 45., 50.],
[ 60., 65., 70., 75., 80.]], dtype=float32)
>>> ints
array([[25],
[55],
[85]], dtype=uint8)
Use the python csv module to do the writes. You need to cast the zipped array rows to tuples and add them together to go from np.array to python data types.
>>> import csv
>>> writer = csv.writer(open('test.csv', 'w'))
>>> writer.writerows(tuple(f)+tuple(i) for f,i in zip(floats, ints))
>>> del writer
>>> print(open('test.csv').read())
0.0,5.0,10.0,15.0,20.0,25
30.0,35.0,40.0,45.0,50.0,55
60.0,65.0,70.0,75.0,80.0,85
Well you can construct the dtype then use zeros or empty to get an empty shell ready for data. Hopefully this will give you a few ideas
>>> import numpy as np
>>>
>>> flds = ["f{:0>{}}".format(i,2) for i in range(7)]
>>> dt = [(fld, 'float32') for fld in flds]
>>> dt.append(('i01', 'int8'))
>>> a = np.zeros((10,), dtype=dt)
>>> a
array([(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0)],
dtype=[('f00', '<f4'), ('f01', '<f4'), ('f02', '<f4'), ('f03', '<f4'), ('f04', '<f4'), ('f05', '<f4'), ('f06', '<f4'), ('i01', 'i1')])
>>>
Mess around with this example def
def num_45():
"""(num_45)...
"""
import numpy as np
flds = ["f{:0>{}}".format(i,2) for i in range(7)]
dt = [(fld, 'float32') for fld in flds]
dt.append(('i01', 'int8'))
a = np.zeros((10,), dtype=dt)
b = np.arange(10*8).reshape(10,8)
c = np.copy(a)
names = a.dtype.names
N = len(names)
for i in range(N):
c[names[i]] = b[:,i]
return a, b, c
Result
>>> a
array([(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0)],
dtype=[('f00', '<f4'), ('f01', '<f4'), ('f02', '<f4'), ('f03', '<f4'), ('f04', '<f4'), ('f05', '<f4'), ('f06', '<f4'), ('i01', 'i1')])
>>> b
array([[ 0, 1, 2, 3, 4, 5, 6, 7],
[ 8, 9, 10, 11, 12, 13, 14, 15],
[16, 17, 18, 19, 20, 21, 22, 23],
[24, 25, 26, 27, 28, 29, 30, 31],
[32, 33, 34, 35, 36, 37, 38, 39],
[40, 41, 42, 43, 44, 45, 46, 47],
[48, 49, 50, 51, 52, 53, 54, 55],
[56, 57, 58, 59, 60, 61, 62, 63],
[64, 65, 66, 67, 68, 69, 70, 71],
[72, 73, 74, 75, 76, 77, 78, 79]])
>>> c
array([(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7),
(8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15),
(16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23),
(24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31),
(32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39),
(40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47),
(48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55),
(56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63),
(64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71),
(72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79)],
dtype=[('f00', '<f4'), ('f01', '<f4'), ('f02', '<f4'), ('f03', '<f4'), ('f04', '<f4'), ('f05', '<f4'), ('f06', '<f4'), ('i01', 'i1')])
Another example with a few lines of manual code to see the construction
n = ['It', 'is', 'easy']
dt = [(n[0], '<f8'), (n[1], '<i8'), (n[2], 'U5')]
d = np.zeros((10,), dtype=dt)
for i in range(len(n)):
d[n[i]] = b[:, i]
yields
>>> d.dtype.names
('It', 'is', 'easy')
>>> d.reshape(10,-1)
array([[(0.0, 1, '2')],
[(8.0, 9, '10')],
[(16.0, 17, '18')],
[(24.0, 25, '26')],
[(32.0, 33, '34')],
[(40.0, 41, '42')],
[(48.0, 49, '50')],
[(56.0, 57, '58')],
[(64.0, 65, '66')],
[(72.0, 73, '74')]],
dtype=[('It', '<f8'), ('is', '<i8'), ('easy', '<U5')])

Categories

Resources