I have a dataframe with a model id and associated values. The columns are date, client_id, model_id, category1, category2, color, and price. I have a simple flask app where the user can select a model id and add to their "purchase" history. Based on the model id I would like to add a row to the dataframe and bring the associated values of category1, category2, color, and price. What is the best way to do this using Pandas? I know in Excel I'd use a vlookup but I am unsure how to go about it using Python. Assume category1, category2, color, and price are unique to each model id.
# Ask for the buyer and the purchased model; input() returns the raw strings.
# (The first prompt previously also read "ENTER Model ID: ".)
client_id = input("ENTER Client ID: ")
model_id = input("ENTER Model ID: ")
def update_history(df, client_id, model_id):
    """Append a purchase row for ``model_id``, copying its known attributes.

    This is the pandas counterpart of a spreadsheet VLOOKUP: the first
    existing row whose ``model_id`` matches is used as a template, so every
    model-specific column (category1, category2, color, price, ...) is carried
    over unchanged. Only ``date`` and ``client_id`` are overwritten.

    Args:
        df: Purchase history. Must contain ``date``, ``client_id`` and
            ``model_id`` columns. It is modified in place.
        client_id: Client making the purchase.
        model_id: Model being purchased; must already occur in ``df``.

    Returns:
        The same ``df`` object, with one row added under the label ``len(df)``.

    Raises:
        KeyError: If no existing row has the requested ``model_id``.
    """
    matches = df.loc[df['model_id'] == model_id]
    if matches.empty:
        raise KeyError(
            'model_id {!r} not found in purchase history'.format(model_id)
        )

    # Copy so that editing the template never writes back into df.
    new_row = matches.iloc[0].copy()
    new_row['date'] = pd.to_datetime('today')
    new_row['client_id'] = client_id

    # Assigning a Series aligns on column names, so column order is irrelevant.
    # The label len(df) is only guaranteed to be new on a default 0..n-1 index.
    df.loc[len(df)] = new_row
    return df
Code below adds a new row with new values to an existing dataframe. The list of new values could be passed in to the function.
Import libraries
import pandas as pd
import numpy as np
import datetime
Create sample dataframe
# Sample purchase history: one list per column, three rows each.
model_id = ['M1', 'M2', 'M3']
today = ['2018-01-01', '2018-01-02', '2018-01-01']
client_id = ['C1', 'C2', 'C3']
category1 = ['orange', 'apple', 'beans']
category2 = ['fruit', 'fruit', 'grains']

# Build the frame from the raw lists, then turn the date strings into
# real timestamps in the same column position.
df = pd.DataFrame(
    dict(
        today=today,
        model_id=model_id,
        client_id=client_id,
        category1=category1,
        category2=category2,
    )
).assign(today=lambda frame: pd.to_datetime(frame['today']))
df
Function
def update_history(df, client_id, model_id, category1, category2):
    """Return a new frame equal to ``df`` plus one purchase row.

    Args:
        df: Existing history with the columns ``today``, ``model_id``,
            ``client_id``, ``category1`` and ``category2``. Not modified.
        client_id: Value for the new row's ``client_id`` column.
        model_id: Value for the new row's ``model_id`` column.
        category1: Value for the new row's ``category1`` column.
        category2: Value for the new row's ``category2`` column.

    Returns:
        A new DataFrame; the added row is stamped with the current time in
        ``today``.
    """
    # Column names in this one-row frame must match the existing dataframe.
    new_row = pd.DataFrame({
        'today': [pd.to_datetime('today')],
        'model_id': [model_id],
        'client_id': [client_id],
        'category1': [category1],
        'category2': [category2],
    })

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    # Like append, it keeps the existing index labels, so the new row carries
    # label 0 (pass ignore_index=True at the call site to renumber).
    return pd.concat([df, new_row])
Call function to append a row with new values to existing dataframe
# Append a row for client C4 / model M4; the returned frame is not assigned here.
update_history(
    df,
    client_id='C4',
    model_id='M4',
    category1='apple',
    category2='fruit',
)
You could try this. In case you are appending more than one row at a time, appending a dictionary to list and then appending them at once to a dataframe is faster.
# Raw sample columns, three purchases.
modelid = ['MOD1', 'MOD2', 'MOD3']
today = ['2018-07-15', '2018-07-18', '2018-07-20']
clients = ['CLA', 'CLA', 'CLB']
cat_1 = ['CAT1', 'CAT2', 'CAT3']
cat_2 = ['CAT11', 'CAT12', 'CAT13']

# Assemble the history frame; keyword order fixes the column order.
mdf = pd.DataFrame(
    dict(
        model_id=modelid,
        today=today,
        client_id=clients,
        cat_1=cat_1,
        cat_2=cat_2,
    )
)
def update_history(df, client_id, model_id):
    """Return ``df`` plus one purchase row whose categories are looked up.

    ``cat_1`` and ``cat_2`` are copied from the first existing row that has
    the requested ``model_id``.

    Args:
        df: History with ``model_id``, ``today``, ``client_id``, ``cat_1``
            and ``cat_2`` columns. Not modified.
        client_id: Client making the purchase.
        model_id: Model being purchased; must already occur in ``df``.

    Returns:
        A new DataFrame with the extra row at the end. Existing index labels
        are kept, so the new row carries label 0.

    Raises:
        IndexError: If no row in ``df`` has this ``model_id``.
    """
    matches = df[df.model_id == model_id]
    if matches.empty:
        # Same exception type the bare .iloc[0] produced, with a usable message.
        raise IndexError(
            'no row with model_id {!r} to copy categories from'.format(model_id)
        )
    template = matches.iloc[0]

    # Collect rows as dicts and build the frame once; this stays cheap if
    # more than one row is ever added per call.
    new_rows = [{
        "today": pd.to_datetime('today'),
        "client_id": client_id,
        "model_id": model_id,
        "cat_1": template["cat_1"],
        "cat_2": template["cat_2"],
    }]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    return pd.concat([df, pd.DataFrame(new_rows)])
# Record that client CLC bought model MOD1 and keep the enlarged frame.
mdf = update_history(df=mdf, client_id="CLC", model_id="MOD1")
This is what I ended up doing. I still think there is a more elegant solution, so please let me know!
# create dataframe
modelid = ['MOD1', 'MOD2', 'MOD3']
today = ['2018-07-15', '2018-07-18', '2018-07-20']
clients = ['CLA', 'CLA', 'CLB']
cat_1 = ['CAT1', 'CAT2', 'CAT3']
cat_2 = ['CAT11', 'CAT12', 'CAT13']
mdf = pd.DataFrame({"model_id": modelid, "today": today, "client_id": clients, "cat_1": cat_1, "cat_2": cat_2})

# reorder columns: the lookup columns first, then the per-purchase columns,
# so a looked-up row can simply be extended with [client_id, today] below
mdf = mdf[['cat_1', 'cat_2', 'model_id', 'client_id', 'today']]

# create lookup table: one row per distinct (cat_1, cat_2, model_id).
# drop_duplicates() returns a new frame; using inplace=True on a column
# selection can trigger SettingWithCopyWarning.
lookup = mdf[['cat_1', 'cat_2', 'model_id']].drop_duplicates()

# get values
client_id = input("ENTER Client ID: ")
model_id = input("ENTER Model ID: ")

# isin() expects a list-like, so wrap the single model id
model_id_lst = [model_id]
today = pd.to_datetime('today')

# grab associated cat_1, and cat_2 from lookup table
temp = lookup[lookup['model_id'].isin(model_id_lst)]
out = temp.values.tolist()
if not out:
    # Without this check out[0] fails with a bare "list index out of range".
    raise IndexError('model_id {!r} not found in lookup table'.format(model_id))
out[0].extend([client_id, today])

# add this as a row to the df (values are positional, hence the reorder above)
mdf.loc[len(mdf)] = out[0]
I am planning to do some financial research and learning using data from the NASDAQ.
I want to retrieve data from Nasdaq such that the header has the following:
Stock Symbol
Company Name
Last Sale
Market Capitalization
IPO
Year
Sector
Industry
Last Update
And I used Python code to get the "list of companies and ticker names" using:
import pandas as pd
import json
# Identifier and human-readable title written into datapackage.json
# (used as 'name' and 'title' by _create_datapackage).
PACKAGE_NAME = 'nasdaq-listings'
PACKAGE_TITLE = 'Nasdaq Listings'
# Pipe-delimited symbol directory that process() downloads with read_csv.
nasdaq_listing = 'ftp://ftp.nasdaqtrader.com/symboldirectory/nasdaqlisted.txt'# Nasdaq only
def process():
    """Download the Nasdaq listing and write the CSV datasets plus descriptor.

    Side effects, all relative to the current working directory:
      * reads ``nasdaq_listing`` (pipe-separated) with ``pd.read_csv``;
      * writes ``data/nasdaq-listed.csv`` (full cleaned table) and
        ``data/nasdaq-listed-symbols.csv`` (Symbol and Company Name only);
      * writes ``datapackage.json`` describing both files.
    """
    import os  # local import: only needed here to create the output folder

    nasdaq = pd.read_csv(nasdaq_listing, sep='|')
    nasdaq = _clean_data(nasdaq)

    # Create a few other data sets
    nasdaq_symbols = nasdaq[['Symbol', 'Company Name']]  # Nasdaq w/ 2 columns

    # (dataframe, filename) datasets we will put in schema & create csv
    datasets = [(nasdaq, 'nasdaq-listed'), (nasdaq_symbols, 'nasdaq-listed-symbols')]

    # to_csv does not create missing directories, so make sure data/ exists
    # instead of failing on a fresh checkout.
    os.makedirs('data', exist_ok=True)
    for df, filename in datasets:
        df.to_csv('data/' + filename + '.csv', index=False)

    with open("datapackage.json", "w") as outfile:
        json.dump(_create_datapackage(datasets), outfile, indent=4, sort_keys=True)
def _clean_data(df):
# TODO: do I want to save the file creation time (last row)
df = df.copy()
# Remove test listings
df = df[df['Test Issue'] == 'N']
# Create New Column w/ Just Company Name
df['Company Name'] = df['Security Name'].apply(lambda x: x.split('-')[0]) #nasdaq file uses - to separate stock type
#df['Company Name'] = TODO, remove stock type for otherlisted file (no separator)
# Move Company Name to 2nd Col
cols = list(df.columns)
cols.insert(1, cols.pop(-1))
df = df.loc[:, cols]
return df
def _create_file_schema(df, filename):
fields = []
for name, dtype in zip(df.columns,df.dtypes):
if str(dtype) == 'object' or str(dtype) == 'boolean': # does datapackage.json use boolean type?
dtype = 'string'
else:
dtype = 'number'
fields.append({'name':name, 'description':'', 'type':dtype})
return {
'name': filename,
'path': 'data/' + filename + '.csv',
'format':'csv',
'mediatype': 'text/csv',
'schema':{'fields':fields}
}
def _create_datapackage(datasets):
    """Assemble the top-level datapackage descriptor.

    Args:
        datasets: Iterable of ``(dataframe, filename)`` pairs; each pair is
            turned into one resource entry by ``_create_file_schema``.

    Returns:
        dict with the package ``name``, ``title``, an empty ``license`` and
        the list of ``resources`` in the order the datasets were given.
    """
    return {
        'name': PACKAGE_NAME,
        'title': PACKAGE_TITLE,
        'license': '',
        'resources': [
            _create_file_schema(frame, stem) for frame, stem in datasets
        ],
    }
# Run the whole download-and-export pipeline as soon as this module is loaded.
process()
Now for each of these symbols, I want to get the other data (as in above).
Is there any way I could do this?
Have you taken a look at pandas-datareader? You may be able to get the other data from there. It has multiple data sources, such as Google and Yahoo Finance:
http://pandas-datareader.readthedocs.io/en/latest/remote_data.html#remote-data-google
How can I split my big dataframe into smaller dataframes and print each of them separately on the web page? Any idea how to edit the code so that a loop can be placed in the context?
here is my code:
def read_raw_data(request):
    """Render the "Step" sheet of LookAhead.xlsm as an HTML table.

    The sheet is loaded, missing values are blanked, three helper columns are
    dropped and the remaining rows are regrouped by ``Product`` before being
    styled and handed to the ``result.html`` template as ``result``.

    Args:
        request: The incoming request object, passed straight to ``render``.

    Returns:
        Whatever ``render`` returns for ``result.html``.
    """
    # `sheet_name` is the current spelling of the former `sheetname` keyword.
    sheet = pd.read_excel(r"LookAhead.xlsm", sheet_name="Step")
    cleaned = sheet.replace(np.nan, '', regex=True)

    # The original referenced an undefined `Step1_Result`; the cleaned sheet
    # is the only frame available at this point.
    trimmed = cleaned.drop(columns=['facility', 'volume', 'indicator_product'])

    # Flatten the per-product groups back into a single list of rows, in
    # groupby order. One pass, and an empty sheet simply yields no rows.
    rows = [
        row
        for _, group in trimmed.groupby('Product')
        for row in group.values.tolist()
    ]

    # NOTE(review): assumes exactly these two columns remain after the drop,
    # in this order -- confirm against the workbook.
    columns = ['indicator', 'Product']
    frame = pd.DataFrame.from_records(rows, columns=columns)

    # Styler.set_precision()/render() were removed from pandas; format() and
    # to_html() are their replacements (pandas >= 1.3).
    table = (
        frame.style
        .set_table_attributes('border="" class = "dataframe table table-hover table-bordered"')
        .format(precision=10)
        .to_html()
    )
    context = {"result": table}
    return render(request, 'result.html', context)
If you want to show a big dataframe across different pages, I recommend using a Paginator. The documentation has a good example of how to implement it.
https://docs.djangoproject.com/en/1.10/topics/pagination/#using-paginator-in-a-view
I'm getting data in two lists and I want to save both of them in one single JSON file. Can someone help me?
I'm using selenium
def get_name(self):
    """Collect the text of every item description and item price.

    The original loops only rebound their own loop variable, so the extracted
    text was thrown away and nothing was returned. The texts are now gathered
    into lists and handed back to the caller.

    Args:
        self: Object exposing ``find_elements_by_class_name``; each element it
            returns must have a ``text`` attribute.

    Returns:
        dict with two lists of strings: ``'names'`` (class ``item-desc``) and
        ``'prices'`` (class ``item-goodPrice``), each in the order the
        elements were found.
    """
    name_elements = self.find_elements_by_class_name('item-desc')
    price_elements = self.find_elements_by_class_name('item-goodPrice')

    names = [element.text for element in name_elements]
    prices = [element.text for element in price_elements]
    return {'names': names, 'prices': prices}
I would create a dictionary and then JSON dumps
An example could be:
import json
def get_name(self):
    """Write the scraped item names and prices to ``output-file-name.json``.

    The file holds a single JSON object with a ``names`` list (text of the
    ``item-desc`` elements) and a ``prices`` list (text of the
    ``item-goodPrice`` elements). Nothing is returned.
    """
    def texts(class_name):
        # Text content of every element carrying the given CSS class.
        return [found.text for found in self.find_elements_by_class_name(class_name)]

    payload = {'names': texts('item-desc'), 'prices': texts('item-goodPrice')}
    with open('output-file-name.json', 'w') as handle:
        json.dump(payload, handle)
EDIT: The first version of this answer only created the JSON string; if you want to create a file as well, you should include what was suggested in #Andersson's comment.
I'm new to programming and I've looked at previous answers to this question but none seem relevant to this specific query.
I'm learning to analyse data with python.
This is the code:
import pandas as pd
import os
# All relative paths below are resolved against this directory.
os.chdir('/Users/Benjy/Documents/Python/Data Analysis Python')

# The .dat files use the two-character separator '::'. Separators longer than
# one character are only handled by the Python parsing engine, so request it
# explicitly instead of relying on a warned fallback.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('ml-1m/users.dat', sep='::', header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('ml-1m/movies.dat', sep='::', header=None, names=mnames, engine='python')

# Join ratings to users, then to movies, on their shared column names.
data = pd.merge(pd.merge(ratings, users), movies)

# Mean rating per title, one column per gender. The value column is named
# 'rating' (see rnames), and pivot_table takes index=/columns= rather than the
# old rows=/cols= keywords -- the failing call never bound mean_ratings.
mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')
I keep getting an error saying mean_ratings is not defined...but surely it is defined in the last line of code above?
I think this will work: mean_ratings=data.pivot_table('rating',index='title',columns='gender',aggfunc='mean')