Issues reading with pandas.read_table - python

I am trying to read in and manipulate some text files containing summary statistics from an MRI analysis I ran.
With a for loop I would like to step into each subject's folder, convert their txt file of summary statistics into a DataFrame, drop the rows that are not needed, and concatenate each subject's cleaned DataFrame onto a master DataFrame. Reading the txt file into a DataFrame seems to work, but I am running into two issues I can't troubleshoot when trying to drop rows.
The txt file is organized like so:
# Title Pathway Statistics
#
# generating_program
/cell_root/software/freesurfer/6.0.0/sys/bin/dmri_pathstats
# cvs_version
Count 2000
Volume 98
Len_Min 67
Len_Max 92
Len_Avg 81.219
Len_Center 87
AD_Avg 0.00152315
AD_Avg_Weight 0.00151198
AD_Avg_Center 0.00141413
There is also one row with many whitespace-separated fields, which might be related to the reading issue:
# cmdline
/cell_root/software/freesurfer/6.0.0/sys/bin/dmri_pathstats --intrc
/homes/dcallow/dti_freesurf/trac/Ex.AES115.long.base_AES115/dpath/fmajor_PP_avg33_mni_bbr --dtbase
/homes/dcallow/dti_freesurf/trac/Ex.AES115.long.base_AES115/dmri/dtifit --path fmajor --subj Ex.AES115.long.base_AES115 --out
/homes/dcallow/dti_freesurf/trac/Ex.AES115.long.base_AES115/dpath/fmajor_PP_avg33_mni_bbr/pathstats.overall.txt --outvox
/homes/dcallow/dti_freesurf/trac/Ex.AES115.long.base_AES115/dpath/fmajor_PP_avg33_mni_bbr/pathstats.byvoxel.txt
I tried removing the drop line, but then I get a different error that occurs earlier in the code, which is also odd:
Traceback (most recent call last):
  File "./txt_2_excel.sh", line 18, in <module>
    df=pd.read_table('pathstats.overall.txt', delim_whitespace=True,names=['measure','value','excess1','excess2','excess3','excess4'])
  File "/Users/amos/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 678, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/Users/amos/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 440, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/Users/amos/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 787, in __init__
    self._make_engine(self.engine)
  File "/Users/amos/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 1014, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/Users/amos/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 1708, in __init__
    self._reader = parsers.TextReader(src, **kwds)
  File "pandas/_libs/parsers.pyx", line 384, in pandas._libs.parsers.TextReader.__cinit__
  File "pandas/_libs/parsers.pyx", line 695, in pandas._libs.parsers.TextReader._setup_parser_source
FileNotFoundError: File b'pathstats.overall.txt' does not exist
#!/Users/amos/anaconda3/bin/python
# Python 3 code to collect each subject's pathway statistics into one table
import os
import pandas as pd

# set working directory to where files are stored
os.chdir("/Volumes/DANIEL/trac_stats")
df_master = pd.DataFrame()
for tract in os.listdir("/Volumes/DANIEL/tract_names/"):
    for subj in os.listdir("/Volumes/DANIEL/trac/"):
        os.chdir("/Volumes/DANIEL/trac/{0}/dpath/{1}/".format(subj, tract))
        os.getcwd()
        df = pd.read_table('pathstats.overall.txt', delim_whitespace=True, names=['measure','value','excess1','excess2','excess3','excess4'])
        df = df.drop([0,1,2,3,4,5,6,7,8,9,10,11,14,15,16,17,18,20,22,23,25,26,28,29,31,32])
        df['subj'] = subj
        df_master = pd.concat([df_master, df])
        print(df_master)
os.chdir("/Volumes/DANIEL/trac_stats/")
df_master.to_excel('trac_stats')
This should produce an Excel sheet named trac_stats with columns ['measure','value','excess1','excess2','excess3','excess4'] and rows 12, 13, 19, 21, 24, and 27 of the data for each subject.
I get the following error
File "./txt_2_excel.sh", line 20, in File
"/Users/amos/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py",
line 3697, in drop errors=errors) File
"/Users/amos/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py",
line 3111, in drop obj = obj._drop_axis(labels, axis, level=level,
errors=errors) File
"/Users/amos/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py",
line 3143, in _drop_axis new_axis = axis.drop(labels, errors=errors)
File
"/Users/amos/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py",
line 4404, in drop '{} not found in axis'.format(labels[mask]))
KeyError: '[32] not found in axis'
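One way around both issues (just a sketch, not the only approach): the KeyError comes from df.drop() being handed positional labels (here 32) that do not exist in every subject's file, and the FileNotFoundError suggests pathstats.overall.txt is simply missing from at least one subject/tract folder. The sketch below keeps the read_table call from the question, checks for the file first, and selects rows by measure name instead of row number (the names in keep are assumptions based on the sample shown):

import os
import pandas as pd

# Measures to keep -- assumed from the sample file; adjust to the rows you actually need.
keep = ['Count', 'Volume', 'Len_Avg', 'AD_Avg', 'AD_Avg_Weight', 'AD_Avg_Center']

df_master = pd.DataFrame()
for tract in os.listdir("/Volumes/DANIEL/tract_names/"):
    for subj in os.listdir("/Volumes/DANIEL/trac/"):
        stats_file = "/Volumes/DANIEL/trac/{0}/dpath/{1}/pathstats.overall.txt".format(subj, tract)
        # Skip subject/tract folders that have no stats file instead of
        # letting read_table raise FileNotFoundError.
        if not os.path.exists(stats_file):
            print("missing:", stats_file)
            continue
        df = pd.read_table(stats_file, delim_whitespace=True,
                           names=['measure', 'value', 'excess1', 'excess2', 'excess3', 'excess4'])
        # Select rows by measure name; unlike drop() with hard-coded row numbers,
        # this cannot raise KeyError when a file is shorter than expected.
        df = df[df['measure'].isin(keep)]
        df['subj'] = subj
        df_master = pd.concat([df_master, df])

df_master.to_excel("/Volumes/DANIEL/trac_stats/trac_stats.xlsx")

Passing the full path to read_table also removes the need for os.chdir, so the script no longer depends on the current working directory.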

Related

pandas read_csv error when loading a plain CSV file

I am having this very weird error with python pandas:
import pandas as pd
df = pd.read_csv('C:\Temp\test.csv', index_col=None, comment='#', sep=',')
The test.csv is a very simple CSV file created in Notepad:
aaa,bbb,date
hhhhh,wws,20220701
Now I get the error:
File "C:\test\untitled0.py", line 10, in <module>
df = pd.read_csv('C:\temp\test.csv', index_col=None, comment='#', sep=',')
File "C:\...\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\...\lib\site-packages\pandas\io\parsers\readers.py", line 586, in read_csv
return _read(filepath_or_buffer, kwds)
File "C:\...\lib\site-packages\pandas\io\parsers\readers.py", line 482, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "C:\...\lib\site-packages\pandas\io\parsers\readers.py", line 811, in __init__
self._engine = self._make_engine(self.engine)
File "C:\...\lib\site-packages\pandas\io\parsers\readers.py", line 1040, in _make_engine
return mapping[engine](self.f, **self.options) # type: ignore[call-arg]
File "C:\...\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py", line 51, in __init__
self._open_handles(src, kwds)
File "C:\...\lib\site-packages\pandas\io\parsers\base_parser.py", line 229, in _open_handles
errors=kwds.get("encoding_errors", "strict"),
File "C:\...\lib\site-packages\pandas\io\common.py", line 707, in get_handle
newline="",
OSError: [Errno 22] Invalid argument: 'C:\temp\test.csv'
I also tried to use Excel to export a CSV file, and get the same error.
Does anyone know what goes wrong?
In a Python string, the backslash in '\t' is an escape character, which causes those two characters (\ followed by t) to mean a tab. You can get around this with a raw string, by prefixing the opening quote with the letter 'r':
r'C:\Temp\test.csv'
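For completeness, a short sketch of the usual ways to write that Windows path so the backslashes survive (any one of these works):

import pandas as pd

# raw string: backslashes are not treated as escape characters
df = pd.read_csv(r'C:\Temp\test.csv', index_col=None, comment='#', sep=',')

# escaped backslashes
df = pd.read_csv('C:\\Temp\\test.csv', index_col=None, comment='#', sep=',')

# forward slashes, which Windows paths also accept
df = pd.read_csv('C:/Temp/test.csv', index_col=None, comment='#', sep=',')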

How can I concatenate 3 large tweet dataframe (csv) files each having approximately 5M tweets?

I have three csv files of tweets, each with ~5M tweets. The following code for concatenating them exits with a low-memory error. My machine has 32GB of memory. How can I assign more memory for this task in pandas?
df1 = pd.read_csv('tweets.csv')
df2 = pd.read_csv('tweets2.csv')
df3 = pd.read_csv('tweets3.csv')
frames = [df1, df2, df3]
result = pd.concat(frames)
result.to_csv('tweets_combined.csv')
The error is:
$ python concantenate_dataframes.py
sys:1: DtypeWarning: Columns (0,1,2,3,4,5,6,8,9,10,11,12,13,14,19,22,23,24) have mixed types.Specify dtype option on import or set low_memory=False.
Traceback (most recent call last):
File "concantenate_dataframes.py", line 19, in <module>
df2 = pd.read_csv('tweets2.csv')
File "/home/mona/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 676, in parser_f
return _read(filepath_or_buffer, kwds)
File "/home/mona/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 454, in _read
data = parser.read(nrows)
File "/home/mona/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 1133, in read
ret = self._engine.read(nrows)
File "/home/mona/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 2037, in read
data = self._reader.read(nrows)
File "pandas/_libs/parsers.pyx", line 859, in pandas._libs.parsers.TextReader.read
UPDATE: I tried the suggestions in the answer and still get an error:
$ python concantenate_dataframes.py
Traceback (most recent call last):
File "concantenate_dataframes.py", line 18, in <module>
df1 = pd.read_csv('tweets.csv', low_memory=False, error_bad_lines=False)
File "/home/mona/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 676, in parser_f
return _read(filepath_or_buffer, kwds)
File "/home/mona/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 454, in _read
data = parser.read(nrows)
File "/home/mona/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 1133, in read
ret = self._engine.read(nrows)
File "/home/mona/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py", line 2037, in read
data = self._reader.read(nrows)
File "pandas/_libs/parsers.pyx", line 862, in pandas._libs.parsers.TextReader.read
File "pandas/_libs/parsers.pyx", line 943, in pandas._libs.parsers.TextReader._read_rows
File "pandas/_libs/parsers.pyx", line 2070, in pandas._libs.parsers.raise_parser_error
pandas.errors.ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.
File "pandas/_libs/parsers.pyx", line 874, in pandas._libs.parsers.TextReader._read_low_memory
File "pandas/_libs/parsers.pyx", line 928, in pandas._libs.parsers.TextReader._read_rows
File "pandas/_libs/parsers.pyx", line 915, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas/_libs/parsers.pyx", line 2070, in pandas._libs.parsers.raise_parser_error
pandas.errors.ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.
I am running the code on Ubuntu 20.04 OS
I think this is a problem with malformed data (some rows in tweets2.csv are not structured properly). For that you can use error_bad_lines=False, and try changing the engine from c to python with engine='python':
ex : df2 = pd.read_csv('tweets2.csv', error_bad_lines=False)
or
ex : df2 = pd.read_csv('tweets2.csv', engine='python')
or maybe
ex : df2 = pd.read_csv('tweets2.csv', engine='python', error_bad_lines=False)
but I recommend identifying those records and repairing them.
And if you want a hacky way to do this, use
1) https://askubuntu.com/questions/941480/how-to-merge-multiple-files-of-the-same-format-into-a-single-file
2) https://askubuntu.com/questions/656039/concatenate-multiple-files-without-header
Specify dtype option on import or set low_memory=False
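A minimal sketch of what that could look like here (dtype=str is a blanket illustrative choice; listing the real per-column dtypes would be better, and the malformed rows behind the ParserError can be skipped with error_bad_lines=False on the pandas version in the traceback, renamed on_bad_lines='skip' in pandas 1.3+):

import pandas as pd

# Reading every column as a string avoids the mixed-type inference that
# triggers the DtypeWarning; real per-column dtypes would be better still.
read_opts = dict(dtype=str, low_memory=False)

df1 = pd.read_csv('tweets.csv', **read_opts)
df2 = pd.read_csv('tweets2.csv', **read_opts)
df3 = pd.read_csv('tweets3.csv', **read_opts)

pd.concat([df1, df2, df3]).to_csv('tweets_combined.csv', index=False)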

How to write custom tables to a new file from a csv file

I have written a script to extract a table from a csv file and write a new csv file that contains this table.
So I have the code below:
import csv
import pandas as pd
with open("C:\\OpenFace\\x64\\Release\\processed\\webcam_2019-04-22-1552.csv") as csvfile:
ddf= pd.read_table(csvfile,sep=" ")
first_letters = ['eye']
headers = ddf.dtypes.index
df= pd.read_table(csvfile,sep=" ",names=[name for name in headers if (name[0] in first_letters)])
print(df)
I'm trying to get only the column names that start with 'eye',
but I get this error:
Traceback (most recent call last):
File "getpoints.py", line 8, in <module>
df= pd.read_table(csvfile,sep=" ",names=[name for name in headers if
(name[0] in first_letters)])
File "C:\Python36\lib\site-packages\pandas\io\parsers.py", line 678, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Python36\lib\site-packages\pandas\io\parsers.py", line 440, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "C:\Python36\lib\site-packages\pandas\io\parsers.py", line 787, in __init__
self._make_engine(self.engine)
File "C:\Python36\lib\site-packages\pandas\io\parsers.py", line 1014, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "C:\Python36\lib\site-packages\pandas\io\parsers.py", line 1708, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas\_libs\parsers.pyx", line 542, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file
How to solve this?
Any ideas?
Thanks.
import csv
import pandas as pd

# Read only the header row, i.e. the column names.
# The question parses the file with sep=" ", so split the header on spaces too.
with open("C:/path/to/.csv", newline="") as f:
    reader = csv.reader(f, delimiter=" ")
    columns = next(reader)

# Keep only the column names that start with "eye".
columns = list(filter(lambda x: x.startswith("eye"), columns))

# usecols restricts parsing to those columns (names= would only relabel columns).
df = pd.read_csv("C:/path/to/.csv", sep=" ", usecols=columns)

I need to make a scatterplot from a csv file containing date on the x axis and time on the y axis, how do I code this?

I need to create a scatterplot that contains the Date on the X axis and the Time on the Y axis. The date looks like (4/10/2019) and the time looks like (23:55:00) if this matters.
I have tried the following code.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("Crimes_-_2001_to_present.csv")
plt.scatter(df["Date_1"],df["Time_1"])
plt.xlabel('Date', fontsize=16)
plt.ylabel('Time', fontsize=16)
plt.title('Occurence of Crime in Relation to Time',fontsize=20)
plt.show()
My error message:
====================== RESTART: F:\scatter plot code.py ======================
Traceback (most recent call last):
File "F:\scatter plot code.py", line 6, in <module>
df = pd.read_csv("Crimes_-_2001_to_present.csv")
File "C:\Users\Andrew\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py", line 702, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\Andrew\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py", line 429, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "C:\Users\Andrew\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py", line 895, in __init__
self._make_engine(self.engine)
File "C:\Users\Andrew\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py", line 1122, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "C:\Users\Andrew\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py", line 1853, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas\_libs\parsers.pyx", line 387, in pandas._libs.parsers.TextReader.__cinit__
File "pandas\_libs\parsers.pyx", line 705, in pandas._libs.parsers.TextReader._setup_parser_source
FileNotFoundError: [Errno 2] File b'Crimes_-_2001_to_present.csv' does not exist: b'Crimes_-_2001_to_present.csv'
I am totally lost, I appreciate any help!
I expected a scatterplot.
You can also try including the full path of your csv file.
df = pd.read_csv("THE FULL PATH TO YOUR FILE")
First, check that your .csv file name is correct;
then put your .csv file "Crimes_-_2001_to_present.csv" and your .py code in the same folder and run again.
If it still fails, rename your .csv file to something simpler, such as "Crimes2001.csv",
change the .py file to df = pd.read_csv(r"Crimes2001.csv"),
and run again; then it should be OK.
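A sketch combining both suggestions: pass an absolute path (the drive letter and location below are assumptions; point it at wherever the file really lives) and parse the Date_1 column so matplotlib orders the x axis chronologically (Date_1 and Time_1 are the column names used in the question):

import pandas as pd
import matplotlib.pyplot as plt

# An absolute path removes any dependence on the current working directory.
csv_path = r"F:\Crimes_-_2001_to_present.csv"  # assumed location

df = pd.read_csv(csv_path)

# Parse dates like 4/10/2019 so the x axis is spaced chronologically.
df["Date_1"] = pd.to_datetime(df["Date_1"], format="%m/%d/%Y")

plt.scatter(df["Date_1"], df["Time_1"])
plt.xlabel('Date', fontsize=16)
plt.ylabel('Time', fontsize=16)
plt.title('Occurrence of Crime in Relation to Time', fontsize=20)
plt.show()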

Declaring path in pycharm not working

I am using PyCharm and I created the project in a folder called collaborative filtering. I have some csv files in a folder called ml-latest-small, which I also placed in the collaborative filtering folder that contains the .py file I am working from.
I am getting the following errors:
Traceback (most recent call last):
File "/Users/usernamehere/Desktop/Machine Learning/Lesson 5/CollaborativeFiltering/movies.py", line 32, in <module>
cf = CollabFilterDataset.from_csv(path, 'ratings.csv', 'userId', 'movieId', 'rating')
File "/Users/usernamehere/Desktop/Machine Learning/Lesson 5/CollaborativeFiltering/venv/lib/python3.6/site-packages/fastai/column_data.py", line 146, in from_csv
df = pd.read_csv(os.path.join(path,csv))
File "/Users/usernamehere/Desktop/Machine Learning/Lesson 5/CollaborativeFiltering/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 709, in parser_f
return _read(filepath_or_buffer, kwds)
File "/Users/usernamehere/Desktop/Machine Learning/Lesson 5/CollaborativeFiltering/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 449, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/Users/usernamehere/Desktop/Machine Learning/Lesson 5/CollaborativeFiltering/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 818, in __init__
self._make_engine(self.engine)
File "/Users/usernamehere/Desktop/Machine Learning/Lesson 5/CollaborativeFiltering/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 1049, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/Users/usernamehere/Desktop/Machine Learning/Lesson 5/CollaborativeFiltering/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 1695, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 402, in pandas._libs.parsers.TextReader.__cinit__
File "pandas/_libs/parsers.pyx", line 718, in pandas._libs.parsers.TextReader._setup_parser_source
FileNotFoundError: File b'/Users/usernamehere/Users/usernamehere/Desktop/Machine Learning/Lesson 5/ratings.csv' does not exist
I am not sure what is wrong with the way I am declaring the path. Here is my code:
import torch
from fastai.learner import *
from fastai.column_data import *
path = '~/Users/usernamehere/Desktop/Machine Learning/Lesson 5'
ratings = pd.read_csv(path+'ratings.csv')
#print(ratings.head())
movies = pd.read_csv(path+'movies.csv')
#print(movies.head())
# Create a subset for Excel
g = ratings.groupby('userId')['rating'].count()
topUsers = g.sort_values(ascending=False)[:15]
g = ratings.groupby('movieId')['rating'].count()
topMovies = g.sort_values(ascending=False)[:15]
top_r = ratings.join(topUsers, rsuffix='_r', how='inner', on='userId')
top_r = top_r.join(topMovies, rsuffix='_r', how='inner', on='movieId')
# pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc=np.sum)
# Collaborative Filtering - High Level
# Get validation indexes
val_idxs = get_cv_idxs(len(ratings))
wd = 2e-4
n_factors = 50
cf = CollabFilterDataset.from_csv(path, 'ratings.csv', 'userId', 'movieId', 'rating')
Edit:
Changing the path to path='ml-latest-small/' seemed to work.
Since you are on a *nix-based system, I would recommend you escape your spaces with \. Here's a simple test on Mac to show a scenario with and without escaping:
$ pwd
/tmp
$ mkdir "Machine Learning"
$ cd Machine Learning
-bash: cd: Machine: No such file or directory
$ cd Machine\ Learning
$ pwd
/tmp/Machine Learning
Here, the ~ means $HOME,
which is why you end up with
/Users/usernamehere/Users/usernamehere/Desktop/Machine Learning/Lesson 5/ratings.csv, which is not a valid path.
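A small sketch of a path setup that avoids both pitfalls, expanding ~ explicitly and joining path components instead of concatenating strings (the asker's own edit, path = 'ml-latest-small/', is the simpler fix since the data sits next to the script):

import os
import pandas as pd

# "~" already expands to the home directory (/Users/usernamehere),
# so the path should not repeat /Users/usernamehere after it.
path = os.path.expanduser("~/Desktop/Machine Learning/Lesson 5")

# os.path.join adds the separator that string concatenation (path + 'ratings.csv') would drop.
ratings = pd.read_csv(os.path.join(path, "ratings.csv"))
movies = pd.read_csv(os.path.join(path, "movies.csv"))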
