Getting an error with some elements
import pandas as pd
import numpy as np
from scipy.signal import argrelextrema
import matplotlib.pyplot as plt
import datetime
#Import our historical data
data = pd.read_csv('Data/sample.csv')
data.columns = [['Date','open','high','low','close','vol']]
data = data.drop_duplicates(keep=False)
data.Date = pd.to_datetime(data.Date,format='%Y.%m.%d %H:%M:%S.%f')
data = data.set_index(data.Date)
data = data[['open', 'high', 'close', 'vol']]
price = data.close.iloc[:100]
# Find our relative extrema
max_idx = argrelextrema(price.values,np.greater,order=1)
min_idx = argrelextrema(price.values,np.less,order=1)
print(max_idx)
print(min_idx)
The error is
Traceback (most recent call last):
File "untitled.py", line 9, in <module>
data.columns = [['Date','open','high','low','close','vol']]
ValueError: Length mismatch: Expected axis has 1 elements, new values have 6 elements
You want to pass a flat list, not a list of lists; otherwise pandas will interpret the nested list as a single column name.
data.columns = ['Date','open','high','low','close','vol']
Edit 1
Your CSV file seems to be tab-separated (\t), so tell read_csv which separator to use:
# Read the file as tab-delimited (assumption: the columns are separated by a
# single tab character -- confirm against the actual file).
# Use the ordinary string '\t' (one real tab character): the raw string r'\t'
# is two characters long, which pandas treats as a regular expression and
# therefore falls back to the slower Python parsing engine with a ParserWarning.
data = pd.read_csv('Data/sample.csv', sep='\t')
# Assign a flat list of six names, one per parsed column.
data.columns = ['Date','open','high','low','close','vol']
Related
I'm trying to solve the Boston house price prediction problem, but I get this error:
AttributeError: 'DataFrame' object has no attribute 'flush'
and this:
`
Cell In [53], line 7, in load_data()
5 def load_data():
6 datafile= pd.read_csv("housing.csv",sep=',')
----> 7 data = np.fromfile(datafile)
8 feature_names = ['RM', 'LSTAT', 'PTRATIO', 'MEDV']
9 feature_num = len(feature_names)
`
here's a part of my code
`
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def load_data():
datafile= pd.read_csv("housing.csv",sep=',')
data = np.fromfile(datafile)
feature_names = ['RM', 'LSTAT', 'PTRATIO', 'MEDV']
feature_num = len(feature_names)
data = data.reshape(data.shape[0] // feature_num, feature_num)
ratio = 0.8
offset = int(data.shape[0] * ratio)
training = data[:offset]
maximums, minimums, avge = training.max(axis=0), training.min(axis=0), training.sum(axis=0) / training.shape[0]
`
the word "flush" doesn't appear in my code or in my data
can anyone give me some idea?
You are reading the housing.csv file with pd.read_csv, which converts it to a DataFrame object. This leads to the error, because np.fromfile expects a file (an open file object, a str, or a path), not a DataFrame.
To get rid of the error, replace the first two statements in the load_data function with a single suitable numpy function such as np.genfromtxt:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
def load_data():
data = np.genfromtxt('housing.csv', delimiter=',')
feature_names = ['RM', 'LSTAT', 'PTRATIO', 'MEDV']
# [...]
I have the following code below.
import geopandas as gpd
import geopandas as gpd
import os
import rasterio
import scipy.sparse as sparse
import pandas as pd
import numpy as np
# Create an empty pandas dataframe called 'table'
table = pd.DataFrame(index = np.arange(0,1))
# Read the points shapefile using GeoPandas
stations = gpd.read_file(r'C......shp')
stations['lon'] = stations['geometry'].x
stations['lat'] = stations['geometry'].y
Matrix = pd.DataFrame()
# Iterate through the rasters and save the data as individual arrays to a Matrix
for files in os.listdir(r'C:\........\Test'):
if files[-4: ] == '.tif':
dataset = rasterio.open(r'C:.......\Test'+'\\'+files)
data_array = dataset.read(1)
data_array_sparse = sparse.coo_matrix(data_array, shape = (5001,5002))
data = files[ :-4]
Matrix[data] = data_array_sparse.toarray().tolist()
print('Processing is done for the raster: '+ files[:-4])
# Iterate through the stations and get the corresponding row and column for the related x, y coordinates
for index, row in stations.iterrows():
station_name = str(row['Name'])
lon = float(row['lon'])
lat = float(row['lat'])
x,y = (lon, lat)
row, col = dataset.index(x, y)
print('Processing: '+ station_name)
# Pick the rainfall value from each stored raster array and record it into the previously created 'table'
for records_date in Matrix.columns.tolist():
a = Matrix[records_date]
rf_value = a.loc[int(row)][int(col)]
table[records_date] = rf_value
transpose_mat = table.T
transpose_mat.rename(columns = {0: 'Rainfall(mm)'}, inplace = True)
transpose_mat.to_csv(r'C:........'+'\\'+station_name+'.csv')
When I run the code I get the following error
Traceback (most recent call last):
File ~\Anaconda3\envs\geo_env\lib\site-packages\pandas\core\indexes\range.py:385 in get_loc
return self._range.index(new_key)
ValueError: 50003 is not in range
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
Input In [15] in <cell line: 1>
rf_value = a.loc[int(row)][int(col)]
File ~\Anaconda3\envs\geo_env\lib\site-packages\pandas\core\indexing.py:967 in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File ~\Anaconda3\envs\geo_env\lib\site-packages\pandas\core\indexing.py:1202 in _getitem_axis
return self._get_label(key, axis=axis)
File ~\Anaconda3\envs\geo_env\lib\site-packages\pandas\core\indexing.py:1153 in _get_label
return self.obj.xs(label, axis=axis)
File ~\Anaconda3\envs\geo_env\lib\site-packages\pandas\core\generic.py:3864 in xs
loc = index.get_loc(key)
File ~\Anaconda3\envs\geo_env\lib\site-packages\pandas\core\indexes\range.py:387 in get_loc
raise KeyError(key) from err
KeyError: 50003
I have tried several solutions but I do not understand why it keeps showing up.
I am pretty new to Python, and I am trying to load a dataset from my computer using scikit-learn. This is what my code looks like:
**whatever.py**
import numpy as np
import csv
from sklearn.datasets.base import Bunch
class Cortex_nuc:
def cortex_nuclear():
with open('C:/Users/User/Desktop/Data_Cortex_Nuclear4.csv') as csv_file:
data_file = csv.reader(csv_file)
temp = next(data_file)
n_samples = int(float(temp[0]))
n_features = int(float(temp[1]))
data = np.empty((n_samples, n_features))
target = np.empty((n_samples,), dtype=np.float64)
for i, sample in enumerate(data_file):
data[i] = np.asarray(sample[:-1], dtype=np.float64)
target[i] = np.asarray(sample[-1], dtype=np.float64)
return Bunch(data=data, target=target)
so then I import it into my project:
from whatever import Cortex_nuc
and after that I try to save it into df:
df = Cortex_nuc.cortex_nuclear()
Btw, this is what the dataset looks like:
this is just a part of the dataset, otherwise it has 77 columns and about a thousand rows.
But I get an error message and I can't seem to figure out why it's happening. Here's the error message:
IndexError Traceback (most recent call last)
<ipython-input-5-a4935f2c187f> in <module>
----> 1 df = Cortex_nuc.cortex_nuclear()
~\whatever.py in cortex_nuclear()
20
21 for i, sample in enumerate(data_file):
---> 22 data[i] = np.asarray(sample[:-1], dtype=np.float64)
23 target[i] = np.asarray(sample[-1], dtype=np.float64)
24
IndexError: index 0 is out of bounds for axis 0 with size 0
Can someone please help me? Thanks!
If you want to create a "sklearn-like" dataset in a Bunch object, you probably want something like this:
import pandas as pd
import numpy as np
from sklearn.utils import Bunch

# For reproducing
from io import StringIO

# In-memory stand-in for a CSV file on disk, so the example is self-contained.
csv_file = StringIO("""
target,A,B
0,0,0
1,0,1
1,1,0
0,1,1
""")


# Loader modelled on the sklearn.datasets loaders: it returns a Bunch with the
# keys DESCR, data, target, target_names and feature_names, or -- when
# return_X_y=True -- just the (data, target) pair.
# NOTE: the function's docstring is stored at runtime as the Bunch's "DESCR"
# entry, so replace the placeholder text with a real description of your data.
def load_xor(*, return_X_y=False):
    """Describe your data here."""
    # A file-like source is consumed by read_csv; rewind it so that calling
    # the loader more than once does not fail on an exhausted buffer.
    # (A plain path string has no seek() and is left untouched.)
    if hasattr(csv_file, "seek"):
        csv_file.seek(0)
    _data_file = pd.read_csv(csv_file)

    _data = Bunch()
    _data["DESCR"] = load_xor.__doc__
    _data["data"] = _data_file[["A", "B"]].to_numpy(dtype=np.float64)
    _data["target"] = _data_file["target"].to_numpy(dtype=np.float64)
    _data["target_names"] = np.array(["false", "true"])
    # Every column except the target is a feature.
    _data["feature_names"] = np.array(list(_data_file.drop(["target"], axis=1)))

    if return_X_y:
        return _data.data, _data.target
    return _data


if __name__ == "__main__":
    # Return and unpack the `X`, `y` tuple
    X, y = load_xor(return_X_y=True)
    print(X, y)
This is because the loaders in sklearn.datasets typically return Bunch objects with specific attributes/keys (for explanations, see the "Returns" section of the load_iris documentation):
>>> from sklearn.datasets import load_iris
>>> data = load_iris()
>>> dir(data)
['DESCR', 'data', 'feature_names', 'filename', 'frame', 'target', 'target_names']
I am getting an error saying: IndexError: list index out of range
when running this code:
import pandas as pd
import pandas_datareader as wb
import datetime as dt
data = wb.DataReader('spy', 'yahoo', start='1/1/1978', end='30/10/2019')
data['Change'] = data['Close'].pct_change() * 100
data['Gaps'] = (((data['Open'] - data['Close'].shift(1))/data['Close'].shift(1)) * 100)
data['Gaps'].astype(float)
data['Performance during day'] = ((data['Close'] - data['Open'])/data['Open']) * 100
data.reset_index(inplace=True)
data['Date'] = data['Date'].dt.date
data = round(data, 2)
filtered_data = list((data[data['Gaps'] > 2].index.astype(int)))
list_of_slices = []
for each in filtered_data:
event = data.iloc[filtered_data[each]-30:filtered_data[each]+60]
list_of_slices.append(event)
I want to extract parts of the DataFrame and create new sub-DataFrames from the extracted data, so that I can plot candlestick charts afterwards.
I am using playerStats.csv, which includes 8 columns, of which I only need 2. So I'm trying to create a new DataFrame with only those 2 columns.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset = pd.read_csv("HLTVData/playerStats.csv")
dataset.head(20)
I only need the ADR and the Rating.
So I first create a matrix with the data set.
mat = dataset.as_matrix()
#4 is the ADR and 6 is the Rating
newDAtaSet = pd.DataFrame(dataset, index=indexMatrix,columns=(mat[:,4],mat[:,6]) )
But it didn't work; it threw an exception:
NameError Traceback (most recent call last)
<ipython-input-10-1f975cc2514a> in <module>()
1 #4 is the ADR and 6 is the Rating
----> 2 newDataSet = pd.DataFrame(dataset, index=indexMatrix,columns=(mat[:,4],mat[:,6]) )
NameError: name 'indexMatrix' is not defined
I also tried using the dataset.
newDataSet = pd.DataFrame(dataset, index=np.array(range(dataset.shape[0])), columns=dataset['ADR'])
/home/tensor/miniconda3/envs/tensorflow35openvc/lib/python3.5/site-packages/pandas/core/internals.py in _make_na_block(self, placement, fill_value)
3984
3985 dtype, fill_value = infer_dtype_from_scalar(fill_value)
-> 3986 block_values = np.empty(block_shape, dtype=dtype)
3987 block_values.fill(fill_value)
3988 return make_block(block_values, placement=placement)
MemoryError:
I think you need parameter usecols in read_csv:
dataset = pd.read_csv("HLTVData/playerStats.csv", usecols=['ADR','Rating'])
Or:
dataset = pd.read_csv("HLTVData/playerStats.csv", usecols=[4,6])