How can I convert a 1d array to a 2d array? - python

I am running the code below.
import datetime
import pandas as pd
import numpy as np
import pylab as pl
import datetime
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from matplotlib.collections import LineCollection
from pandas_datareader import data as wb
from sklearn import cluster, covariance, manifold
###############################################################################
start = '2019-02-01'
end = '2020-02-01'
tickers = ['MMM',
'ABT',
'ABBV',
'ABMD',
'ACN',
'ATVI']
thelen = len(tickers)
price_data = []
for ticker in tickers:
    prices = wb.DataReader(ticker, start=start, end=end, data_source='yahoo')[['Open', 'Adj Close']]
    price_data.append(prices.assign(ticker=ticker)[['ticker', 'Open', 'Adj Close']])
df = pd.concat(price_data)
df.rename(columns = {'ticker':'Ticker', 'Adj Close':'Close'}, inplace = True)
df.dtypes
df.head()
df.shape
#df.reset_index()
pd.set_option('display.max_columns', 500)
open = np.array([df.Open]).astype(np.float)
close = np.array([df.Close]).astype(np.float)
# The daily variations of the quotes are what carry most information
variation = (close - open)
The code above gives me this 1d array:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
0 0.38 0.93 0.3 0.72 -0.42 0.37 0.36 0.71 0.89 -0.32 0.11 -0.06 -0.17 0.4 0.25 -0.48 0.1 -0.29 -0.29 -0.38 0.21 0.22 0.11 -0.01 -0.07 -0.66 0 -0.78 0.24 -0.89 0.07
My desired output would be a 2d array, like this.
0 1 2 3 4 5 6 7 8 9 10
0 0.38 0.93 0.3 0.72 -0.42 0.37 0.36 0.71 0.89 -0.32 0.11
1 0.61 0.18 0.63 0.02 -0.03 -0.27 -0.75 -1 0.48 -0.74 -0.34
2 1.77 0.95 1.69 2.05 -1.36 2.25 1.83 -0.8 1.35 -0.99 -1.35
3 0.7 -0.12 0.32 -0.14 -0.53 0.63 0.85 0.46 0.23 -0.83 0.59
4 1.71 -0.8 0.74 -0.58 -1.2 0.38 0.35 0.06 0.56 -0.38 0.64
5 0.47 0.25 0.93 -0.9 -0.15 0.64 -0.11 -0.09 0.44 -0.47 -0.09
How can I change my 1d array to a 2d array, with the open-close differences running horizontally and each stock's open-close series on its own row? Thanks!

I actually got this to work. Apparently you have to store items in a list rather than a dataframe.
import datetime
import pandas as pd
import numpy as np
import pylab as pl
import datetime
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from matplotlib.collections import LineCollection
from pandas_datareader import data as wb
from sklearn import cluster, covariance, manifold
start = '2019-02-01'
end = '2020-02-01'
tickers = ['AXP',
'AAPL',
'BA',
'CAT',
'CSCO',
'CVX',
'XOM',
'GS',
'HD',
'IBM',
'INTC',
'JNJ',
'KO',
'JPM',
'MCD',
'MMM',
'MRK',
'MSFT',
'NKE',
'PFE',
'PG',
'TRV',
'UNH',
'UTX',
'VZ',
'V',
'WBA',
'WMT',
'DIS']
thelen = len(tickers)
price_data = []
for ticker in tickers:
    prices = wb.DataReader(ticker, start=start, end=end, data_source='yahoo')[['Open', 'Adj Close']]
    price_data.append(prices.assign(ticker=ticker)[['ticker', 'Open', 'Adj Close']])
#names = np.reshape(price_data, (len(price_data), 1))
names = pd.concat(price_data)
names.reset_index()
#pd.set_option('display.max_columns', 500)
open = np.array([q['Open'] for q in price_data]).astype(np.float)
close = np.array([q['Adj Close'] for q in price_data]).astype(np.float)
#close_prices = np.array([q.close for q in quotes]).astype(np.float)
# The daily variations of the quotes are what carry most information
variation = (close - open)
# pd.DataFrame(variation).to_csv("C:\\path\\file.csv")
# Learn a graphical structure from the correlations
edge_model = covariance.GraphicalLassoCV()
X = variation
# standardize the time series: using correlations rather than covariance
# is more efficient for structure recovery
X = variation.copy().T
X /= X.std(axis=0)
edge_model.fit(X)
# Cluster using affinity propagation
_, labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels = labels.max()
details = [(name,cluster) for name, cluster in zip(tickers,labels)]
for detail in details:
    print(detail)
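For reference, a minimal alternative sketch of my own (not part of the original post): with the single concatenated df from the question, the flat open-close differences can simply be reshaped into one row per ticker, assuming every ticker has the same number of trading days.
# Hedged alternative to the list-of-frames approach, using the question's df
# (columns 'Open' and 'Close'). Assumes all tickers share the same trading days.
variation_2d = (df['Close'].to_numpy(dtype=float)
                - df['Open'].to_numpy(dtype=float)).reshape(len(tickers), -1)
print(variation_2d.shape)  # (len(tickers), n_days)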


Initial value of multiple variables dataframe for time dilation

Dataframe:
   product1  product2  product3  product4  product5
0  straws    orange    melon     chair     bread
1  melon     milk      book      coffee    cake
2  bread     melon     coffe     chair     book
   CountProduct1  CountProduct2  CountProduct3  CountProduct4  CountProduct5
0  1              1              1              1              1
1  2              1              1              1              1
2  2              3              2              2              2
   RatioProduct1  RatioProduct2  RatioProduct3  RatioProduct4  RatioProduct5
0  0.28           0.54           0.33           0.35           0.11
1  0.67           0.25           0.13           0.11           0.59
2  2.5            1.69           1.9            2.5            1.52
I want to create five other columns that keep the initial ratio of each item along the dataframe.
Output:
   InitialRatio1  InitialRatio2  InitialRatio3  InitialRatio4  InitialRatio5
0  0.28           0.54           0.33           0.35           0.11
1  0.33           0.25           0.13           0.31           0.59
2  0.11           0.33           0.31           0.35           0.13
Check your data again: do you have a typo, with product3 = coffe and product4 = coffee? I fixed coffe to coffee below; as a result, the 0.31 values should not appear.
import pandas as pd
pd.set_option('display.max_rows', None) # print everything rows
pd.set_option('display.max_columns', None) # print everything columns
df = pd.DataFrame(
    {
        'product1': ['straws', 'melon', 'bread'],
        'product2': ['orange', 'milk', 'melon'],
        'product3': ['melon', 'book', 'coffee'],
        'product4': ['chair', 'coffee', 'chair'],
        'product5': ['bread', 'cake', 'book'],
        'time': [1, 2, 3],
        'Count1': [1, 2, 2],
        'Count2': [1, 1, 3],
        'Count3': [1, 1, 2],
        'Count4': [1, 1, 2],
        'Count5': [1, 1, 2],
        'ratio1': [0.28, 0.67, 2.5],
        'ratio2': [0.54, 0.25, 1.69],
        'ratio3': [0.33, 0.13, 1.9],
        'ratio4': [0.35, 0.11, 2.5],
        'ratio5': [0.11, 0.59, 1.52],
    })
print(df)
product = df[['product1', 'product2', 'product3', 'product4', 'product5']].stack().reset_index()
count = df[['Count1', 'Count2', 'Count3', 'Count4', 'Count5']].stack().reset_index()
ratio = df[['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5']].stack().reset_index()
print(ratio)
arr = pd.unique(product[0])
aaa = [i for i in range(len(arr)) if product[product[0] == arr[i]].count()[0] > 1]
for i in aaa:
    prod_ind = product[product[0] == arr[i]].index
    val_ratio = ratio.loc[prod_ind[0], 0]
    ratio.loc[prod_ind, 0] = val_ratio
print(ratio.pivot_table(index='level_0', columns='level_1', values=[0]))
Output:
level_1 ratio1 ratio2 ratio3 ratio4 ratio5
level_0
0 0.28 0.54 0.33 0.35 0.11
1 0.33 0.25 0.13 0.11 0.59
2 0.11 0.33 0.11 0.35 0.13
To work with the data, each group of columns first needs to be stacked into a single column using stack().reset_index(). arr is the list of unique products, and aaa collects the positions in arr of products that occur more than once.
prod_ind = product[product[0] == arr[i]].index
Inside the loop, this gets the row indexes of every occurrence of the current product.
val_ratio = ratio.loc[prod_ind[0], 0]
This takes the ratio of the product's first occurrence.
ratio.loc[prod_ind, 0] = val_ratio
And this assigns that value to all occurrences of the product.
Explicit .loc indexing is used to access the values: the row labels go on the left of the comma and the column labels on the right.
Finally, pivot_table rebuilds the original table layout.
To insert the processed data into the original dataframe, simply use the following:
table = ratio.pivot_table(index='level_0', columns='level_1', values=[0])
df[['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5']] = table
print(df)
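For reference, the same first-value propagation can also be written without the explicit loop; this is a sketch of my own using the stacked product and ratio frames from above:
# Hedged alternative: align the stacked products with the stacked ratios, then let
# groupby().transform('first') propagate each product's first ratio to all rows.
ratio['product'] = product[0]
ratio[0] = ratio.groupby('product')[0].transform('first')
print(ratio.pivot_table(index='level_0', columns='level_1', values=[0]))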
If you're after code to create the init_rateX columns then the following will work
pd.DataFrame(
    np.divide(
        df[["ratio1", "ratio2", "ratio3", "ratio4", "ratio5"]].to_numpy(),
        df[["Count1", "Count2", "Count3", "Count4", "Count5"]].to_numpy(),
    ),
    columns=["init_rate1", "init_rate2", "init_rate3", "init_rate4", "init_rate5"],
)
which gives
init_rate1 init_rate2 init_rate3 init_rate4 init_rate5
0 0.28 0.25 0.33 0.57 0.835
1 0.33 0.13 0.97 0.65 0.760
2 0.54 0.11 0.45 0.95 1.160
3 0.35 0.59 0.34 1.25 1.650
However, it does not agree with your calcs for init_rate4 or init_rate5, so some clarification might be needed.

How to visualise means with Seaborn?

I have a Pandas data frame with the following structure:
alpha beta gamma mse
0 0.00 0.00 0.00 0.000000
1 0.05 0.05 0.90 0.025411
2 0.05 0.10 0.85 0.025794
3 0.05 0.15 0.80 0.026289
4 0.05 0.20 0.75 0.025320
.. ... ... ... ...
148 0.75 0.05 0.20 0.026816
149 0.75 0.10 0.15 0.025817
150 0.75 0.15 0.10 0.025702
151 0.80 0.05 0.15 0.027104
152 0.80 0.10 0.10 0.025936
I would like to visualise the data frame with a heatmap where alpha is represented on the x-axis, beta is represented on the y-axis, and for each square of the lattice, the mean MSE over all gammas is computed. Is there an easy way to do this by using Seaborn?
Thanks in advance.
For what you showed, yes, you can do it with:
sns.heatmap(df.pivot_table(index='beta', columns='alpha', values='mse'))
pivot_table aggregates with the mean by default, so each (beta, alpha) cell is already the mean mse over all gammas.
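A minimal, self-contained sketch of the same idea with made-up data (the full frame is not shown in the question), making the aggregation explicit:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Hypothetical data standing in for the frame in the question
df = pd.DataFrame({
    'alpha': [0.05, 0.05, 0.05, 0.10, 0.10, 0.10],
    'beta':  [0.05, 0.05, 0.10, 0.05, 0.10, 0.10],
    'gamma': [0.90, 0.85, 0.85, 0.85, 0.80, 0.75],
    'mse':   [0.0254, 0.0258, 0.0263, 0.0253, 0.0268, 0.0259],
})
# One cell per (beta, alpha) pair, averaging mse over all gammas
heat = df.pivot_table(index='beta', columns='alpha', values='mse', aggfunc='mean')
sns.heatmap(heat, annot=True)
plt.show()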
Once you have the data, you could use a pivoted DataFrame to build the heatmap:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Assuming that you have the df variable with your data
# pivot the data
pivoted = df.pivot(index='alpha', columns='beta', values='mse')
# plot the heatmap
sns.heatmap(pivoted, annot=True)
plt.show()
More information in the official documentation: https://seaborn.pydata.org/generated/seaborn.heatmap.html

Matplotlib like graphs with plotly express

Following is my Pandas dataframe; it's very easy to create a line plot for all the items with matplotlib. I just write
df.plot()
and it creates a separate line for each item. But I want to create the same line plots with Plotly Express, and I am not able to do it, maybe because I have date columns.
df:
dataDate 2019-10-01 2019-10-02 2019-10-01 2019-10-01 2019-10-02
name
item1 0.24 0.12 0.19 0.20 0.12
item2 0.26 0.25 0.17 0.17 0.13
item3 0.22 0.24 0.18 0.17 0.16
item4 0.72 0.22 0.19 0.20 0.15
item5 0.55 0.23 0.19 0.18 0.14
Please suggest how I can create line plots for all the items across time with Plotly Express. Thanks.
They have great examples in their documentation (https://plot.ly/python/plotly-express/#scatter-and-line-plots).
By design it works best with tidy data so you would have a column for Date, a column for Item Number, and then a column for the value.
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import plotly.express as px
base = datetime.today()
dates = [base - timedelta(days=x) for x in range(10)] * 3
cats = ['A'] * 10 + ['B'] * 10 + ['C'] * 10
vals = np.arange(30)
df = pd.DataFrame({'Date': dates, 'Category': cats, 'Value': vals})
px.line(df, x='Date', y='Value', color='Category')
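For the wide frame in the question (items in the index, dates as columns), a sketch of my own for the reshape step, assuming the index is named name and the date columns are unique:
import plotly.express as px
# Melt the wide frame into tidy (long) form: one row per item per date.
tidy = (df.reset_index()
          .melt(id_vars='name', var_name='dataDate', value_name='value'))
fig = px.line(tidy, x='dataDate', y='value', color='name')
fig.show()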

group and divide values ​in python

What I want: for records that have the same "NROCUENTA", create a column holding the first "SALDO" divided by the number of records in that group.
import pandas as pd
import csv, sys
try:
    file_encoding = 'utf8'
    input_fd = open('DAT_210.del', encoding=file_encoding)
    df = pd.read_csv(input_fd, sep=' ', quotechar='"', error_bad_lines=False)
    result = df.groupby('NROCUENTA').apply(
        lambda x: ................................
    )
except csv.Error as e:
    sys.exit('file {}, line {}: {}'.format("datahist.del", reader.line_num, e))
result2 = result.to_csv('result001.csv', mode='w', index=False)
SALDO=FIRST(SALDO)/COUNT(NROCUENTA)
DATA
"NROCUENTA" "SALDO"
"210-1-388" 159.20
"210-1-388" 159.20
"210-1-1219" 0.93
"210-1-11657" 0.06
"210-1-11657" 0.06
"210-1-11657" 0.06
RESULT
"210-1-388" 79.6
"210-1-388" 79.6
"210-1-1219" 0.93
"210-1-11657" 0.02
"210-1-11657" 0.02
"210-1-11657" 0.02
TRIED
I was trying the dfply library, but it throws errors, so I decided to do it with pandas.
IIUC, you need transform with 'count' and then divide the SALDO column by it. I assign the result to the column AVG_SALDO:
df['AVG_SALDO'] = df['SALDO'] / df.groupby('NROCUENTA').SALDO.transform('count')
Out[1112]:
NROCUENTA SALDO AVG_SALDO
0 210-1-388 159.20 79.60
1 210-1-388 159.20 79.60
2 210-1-1219 0.93 0.93
3 210-1-11657 0.06 0.02
4 210-1-11657 0.06 0.02
5 210-1-11657 0.06 0.02
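If SALDO were not guaranteed to be identical within a group, a sketch of my own that matches the FIRST(SALDO)/COUNT(NROCUENTA) formula literally would be:
# Hedged variant: take each group's first SALDO explicitly instead of relying on
# every row in the group carrying the same SALDO value.
g = df.groupby('NROCUENTA')['SALDO']
df['AVG_SALDO'] = g.transform('first') / g.transform('count')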

Python: plot timedelta and cumulative values

I have a dataframe with 1000 rows like below
start_time val
0 15:16:25 0.01
1 15:17:51 0.02
2 15:26:16 0.03
3 15:27:28 0.04
4 15:32:08 0.05
5 15:32:35 0.06
6 15:33:02 0.07
7 15:33:46 0.08
8 15:33:49 0.09
9 15:34:04 0.10
10 15:34:23 0.11
11 15:34:32 0.12
12 15:34:32 0.13
13 15:35:53 0.14
14 15:37:31 0.15
15 15:38:11 0.16
16 15:38:17 0.17
17 15:38:29 0.18
18 15:40:07 0.19
19 15:40:32 0.20
20 15:40:53 0.21
... .... ..
I would like to plot it, with the time on the x axis. I have used
plt.plot(df['start_time'].dt.total_seconds(),df['val'])
# generate a formatter, using the fields required
fmtr = mdates.DateFormatter("%H:%M")
# need a handle to the current axes to manipulate it
ax = plt.gca()
# set this formatter to the axis
ax.xaxis.set_major_formatter(fmtr)
And it works fine, but on the x axis I have labels which are not showing the correct time.
Any help? Thank you in advance.
You can convert timedeltas to seconds:
plt.plot(df['start_time'].dt.total_seconds(),df['val'])
A solution for converting timedeltas to strings (adapted from another answer); it is only necessary to convert nanoseconds to seconds:
import datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(df['start_time'], df['val'])
def timeTicks(x, pos):
    seconds = x / 10**9
    d = datetime.timedelta(seconds=seconds)
    return str(d)
formatter = matplotlib.ticker.FuncFormatter(timeTicks)
ax.xaxis.set_major_formatter(formatter)
plt.xticks(rotation=90)
plt.show()
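For completeness, a sketch of the setup this assumes (my own illustration, not from the original answer): start_time should be a timedelta column, so the x values matplotlib receives are timedelta64 nanoseconds, which is why timeTicks divides by 10**9.
import pandas as pd
# Hypothetical reconstruction of the frame in the question
df = pd.DataFrame({
    'start_time': pd.to_timedelta(['15:16:25', '15:17:51', '15:26:16', '15:27:28']),
    'val': [0.01, 0.02, 0.03, 0.04],
})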
