trouble looping xarray dataframe through subdirectories - python

I am trying to make a big data frame by looping through sub-directories. I want to:
i) read data from all the files (with .nc extension) in the subdirectories,
ii) select a particular chunk of it
iii) save it in a output.nc file.
import os
import xarray as xr
import numpy as np
rootdir ='/Users/sm/Desktop/along_track_J2'
data_new=[]
for subdir, dirs, files in os.walk(rootdir):
for file in files:
file_name= os.path.join(subdir, file)
df=xr.open_dataset(file_name)
df['longitude'] = ((df.longitude + 180) % 360 - 180).sortby(df.longitude)
ds=df.where((df.longitude>=-65) & (df.longitude<=-45) & (df.latitude>55), drop=True)
data_new.append(ds)
Somehow xarray cannot read the file and I see the following error:
File "", line 1, in
runfile('/Users/sm/Desktop/jason2_processing.py', wdir='/Users/sm/Desktop')
File "/Users/sm/anaconda3/lib/python3.7/site-packages/spyder_kernels/customize/spydercustomize.py", line 668, in runfile
execfile(filename, namespace)
File "/Users/sm/anaconda3/lib/python3.7/site-packages/spyder_kernels/customize/spydercustomize.py", line 108, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/Users/sm/Desktop/jason2_processing.py", line 18, in
df=xr.open_dataset(file_name)
File "/Users/sm/anaconda3/lib/python3.7/site-packages/xarray/backends/api.py", line 320, in open_dataset
**backend_kwargs)
File "/Users/sm/anaconda3/lib/python3.7/site-packages/xarray/backends/netCDF4_.py", line 331, in open
ds = opener()
File "/Users/sm/anaconda3/lib/python3.7/site-packages/xarray/backends/netCDF4_.py", line 230, in _open_netcdf4_group
ds = nc4.Dataset(filename, mode=mode, **kwargs)
File "netCDF4/_netCDF4.pyx", line 2123, in netCDF4._netCDF4.Dataset.init
File "netCDF4/_netCDF4.pyx", line 1743, in netCDF4._netCDF4._ensure_nc_success
OSError: [Errno -51] NetCDF: Unknown file format: b'/Users/sm/Desktop/along_track_J2/.DS_Store'
Can anyone please help me with this. Thank you in advance.

OSError: [Errno -51] NetCDF: Unknown file format: b'/Users/sm/Desktop/along_track_J2/.DS_Store'
You are currently looping through all files, NetCDF and other (system) files. .DS_store is a file created by macOS, which isn't a NetCDF file. If you only want to process NetCDF files, something like this should work:
...
for file in files:
if file.split('.')[-1] == 'nc':
file_name= os.path.join(subdir, file)
df = xr.open_dataset(file_name)
....
if file.split('.')[-1] == 'nc': (the only thing which I added) basically checks if the file extension is .nc, and ignores other files.

Related

ValueError: "Unresolved named destination '_PAGE1'"error in PyPDF2

I am trying to merge some .pdf files in sub-folders
dir_name = r"E:\Data"
import os, PyPDF2
from PyPDF2 import PdfFileMerger, PdfFileReader
#hdir=r #path to the folder directory; would suggest using os.getcwd()
for root,dirs,files in os.walk(dir_name):
merger = PdfFileMerger()
for dir in dirs:
sub_dir = os.path.join(root, dir)
print(sub_dir)
for filename in os.listdir(sub_dir):
print(filename)
if filename.endswith(".pdf"):
filepath = os.path.join(sub_dir, filename)
#print(filepath)
merger.append(PdfFileReader(open(filepath, 'rb')))
#merger.write(str(filename))
merger.write(os.path.join(dir_name,dir+'.pdf'))
code runs as expected for some sub-folders. But it is giving error as -
ValueError: Unresolved named destination '_PAGE1'.
Help me to solve this issue?
Traceback of error
File "<ipython-input-5-bd9240b14192>", line 1, in <module>
runfile('E:/Data/xxx.py', wdir='E:/Data')
File "C:\Anaconda\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
execfile(filename, namespace)
File "C:\Anaconda\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "E:/Data/xxx.py", line 23, in <module>
merger.append(PdfFileReader(open(filepath, 'rb')))
File "C:\Anaconda\lib\site-packages\PyPDF2\merger.py", line 203, in append
self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)
File "C:\Anaconda\lib\site-packages\PyPDF2\merger.py", line 174, in merge
self._associate_dests_to_pages(srcpages)
File "C:\Anaconda\lib\site-packages\PyPDF2\merger.py", line 436, in _associate_dests_to_pages
raise ValueError("Unresolved named destination '%s'" % (nd['/Title'],))
ValueError: Unresolved named destination '_PAGE1'```

Having problem with opening same files openpyxl

Writting a program, that shuffels contents in files. All files are almost the same, but it doesn't work for some of them. Can't understand.
for file in allFiles:
print(file)
items = []
fileName = file
fileIndex = 1
directory = os.path.join(path, fileName[:-5].strip())
if not os.path.exists(directory):
os.mkdir(directory)
theFile = openpyxl.load_workbook(file)
allSheetNames = theFile.sheetnames
And after some quantity of files, it shows me these errors:
Traceback (most recent call last):
File "D:\staff\Python\NewProject\glow.py", line 25, in <module>
theFile = openpyxl.load_workbook(file)
File "C:\Users\User\AppData\Local\Programs\Python\Python38-32\lib\site-packages\openpyxl\reader\excel.py", line 313, in load_workbook
reader = ExcelReader(filename, read_only, keep_vba,
File "C:\Users\User\AppData\Local\Programs\Python\Python38-32\lib\site-packages\openpyxl\reader\excel.py", line 124, in __init__
self.archive = _validate_archive(fn)
File "C:\Users\User\AppData\Local\Programs\Python\Python38-32\lib\site-packages\openpyxl\reader\excel.py", line 96, in _validate_archive
archive = ZipFile(filename, 'r')
File "C:\Users\User\AppData\Local\Programs\Python\Python38-32\lib\zipfile.py", line 1269, in __init__
self._RealGetContents()
File "C:\Users\User\AppData\Local\Programs\Python\Python38-32\lib\zipfile.py", line 1336, in _RealGetContents
raise BadZipFile("File is not a zip file")
zipfile.BadZipFile: File is not a zip file
But before that everything worked fine, there was no error. Can someone guess, why? Thanks, everybody.
Looking for files that way:
path = os.getcwd()
sourcePath = os.getcwd() + '\source'
extension = 'xlsx'
os.chdir(sourcePath)
allFiles = glob.glob('*.{}'.format(extension))
You iterate over all files not regarding the filetype. Probably you or a process added a file to the directory which is no xlsx file. This is why openpyxl fails to read it.

Tablib xlsx file badZip file issue

I am getting error on opening xlsx extension file in windows 8 using tablib library.
python version - 2.7.14
error is as follows:
python suit_simple_sheet_product.py
Traceback (most recent call last):
File "suit_simple_sheet_product.py", line 19, in <module>
data = tablib.Dataset().load(open(BASE_PATH).read())
File "C:\Python27\lib\site-packages\tablib\core.py", line 446, in load
format = detect_format(in_stream)
File "C:\Python27\lib\site-packages\tablib\core.py", line 1157, in detect_format
if fmt.detect(stream):
File "C:\Python27\lib\site-packages\tablib\formats\_xls.py", line 25, in detect
xlrd.open_workbook(file_contents=stream)
File "C:\Python27\lib\site-packages\xlrd\__init__.py", line 120, in open_workbook
zf = zipfile.ZipFile(timemachine.BYTES_IO(file_contents))
File "C:\Python27\lib\zipfile.py", line 770, in __init__
self._RealGetContents()
File "C:\Python27\lib\zipfile.py", line 811, in _RealGetContents
raise BadZipfile, "File is not a zip file"
zipfile.BadZipfile: File is not a zip file
path location is as follows =
BASE_PATH = 'C:\Users\anju\Downloads\automate\catalog-5090 fabric detail and price list.xlsx'
Excel .xlsx files are actually zip files. In order for the unzip to work correctly, the file must be opened in binary mode, as such your need to open the file using:
import tablib
BASE_PATH = r'c:\my folder\my_test.xlsx'
data = tablib.Dataset().load(open(BASE_PATH, 'rb').read())
print data
Add r before your string to stop Python from trying to interpret the backslash characters in your path.

Python - program keeps iterating over deleted files

I've written a program that iterates over all CSV files in a directory and creates a new CSV file based on their contents.
I've written a function ('summary()') that performs these tasks and is called by the following code
cwd = os.getcwd()
csv_list = []
for root, dirs, filenames in os.walk(cwd):
for f in filenames:
if f.endswith('.csv'):
csv_list.append(f)
#for root, dirs, filenames in os.walk(cwd):
summary(csv_list)
Once the file has been loaded into the function, its added to a pandas DF by the following code
df = pd.concat((pd.read_csv(f, parse_dates=True, sep=';') for f in files))
The function creates a output csvfile called 'combined_csv'.
I delete this file between each run (as I currently testing the program).
However I keep running into the following peculiar bug.
FileNotFoundError: File 'combined.csv' does not exist
Even though I deleted the file, the program still parses over it - (where it crashes when it tries to load). Why though? I restart the program after deleting the file, the file should not appear in the 'csv_list' variable at all.
Is the information cached somehow?
I've added the full traceback below.
Traceback (most recent call last):
File "summary.py", line 112, in <module>
summary(csv_list)
File "summary.py", line 17, in summary
df = pd.concat((pd.read_csv(f, parse_dates=True, sep=';') for f in files))
File "/usr/local/lib/python3.5/dist-packages/pandas/core/reshape/concat.py", line 206, in concat
copy=copy)
File "/usr/local/lib/python3.5/dist-packages/pandas/core/reshape/concat.py", line 236, in __init__
objs = list(objs)
File "summary.py", line 17, in <genexpr>
df = pd.concat((pd.read_csv(f, parse_dates=True, sep=';') for f in files))
File "/usr/local/lib/python3.5/dist-packages/pandas/io/parsers.py", line 655, in parser_f
return _read(filepath_or_buffer, kwds)
File "/usr/local/lib/python3.5/dist-packages/pandas/io/parsers.py", line 405, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/usr/local/lib/python3.5/dist-packages/pandas/io/parsers.py", line 764, in __init__
self._make_engine(self.engine)
File "/usr/local/lib/python3.5/dist-packages/pandas/io/parsers.py", line 985, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/usr/local/lib/python3.5/dist-packages/pandas/io/parsers.py", line 1605, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 394, in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)
File "pandas/_libs/parsers.pyx", line 710, in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)
FileNotFoundError: File b'combined.csv' does not exist
Edit I've simplified the program (the code wasnt relevant to the problem, and changed the code to this. This is all of the code that is run.
I am executing the program from a terminal (Ubuntu 16.04), located in the directory.
$ pwd
returns
/home/jasper/PycharmProjects/AHP_Scanner/PVM/true_run/testsum
$ ls -a /home/jasper/PycharmProjects/AHP_Scanner/PVM/true_run/testsum
returns:
. fixed_10.csv fixed_13.csv fixed_16.csv fixed_19.csv fixed_21.csv fixed_4.csv fixed_7.csv goed
.. fixed_11.csv fixed_14.csv fixed_17.csv fixed_1.csv fixed_2.csv fixed_5.csv fixed_8.csv summary.py
fixed_0.csv fixed_12.csv fixed_15.csv fixed_18.csv fixed_20.csv fixed_3.csv fixed_6.csv fixed_9.csv
As we can see, the file 'combined_csv' does not exist
Yet when I run the following code: (this is all of the code that is run, the rest of summary.py has been commented out)
cwd = os.getcwd()
csv_list = []
for root, dirs, filenames in os.walk(cwd):
for f in filenames:
if f.endswith('.csv'):
print(f)
I get this response:
fixed_8.csv
fixed_10.csv
fixed_4.csv
fixed_11.csv
fixed_9.csv
fixed_7.csv
fixed_0.csv
fixed_12.csv
fixed_2.csv
fixed_5.csv
fixed_20.csv
fixed_18.csv
fixed_14.csv
fixed_6.csv
fixed_15.csv
fixed_3.csv
fixed_1.csv
fixed_17.csv
fixed_13.csv
fixed_19.csv
fixed_16.csv
fixed_21.csv
combined.csv
I am at a loss why this file keeps appearing.

python - extract csv files from 7z

I have lots of csv files contained in different 7z files. I want to find specific csv files in those 7z files and save them decompressed in a different directory.
I have tried
import os
import py7zlib
tree = r'Where_the_7zfiles_are_stored'
dst = r'Where_I_want_to_store_the_csvfiles'
for dirpath, dirname, filename in os.walk(tree):
for myfile in filename:
if myfile.endswith('2008-01-01_2008-04-30_1.7z'):
myZip = py7zlib.Archive7z(open(os.path.join(dirpath,myfile), 'rb'))
csvInZipFile = zip(myZip.filenames,myZip.files)
for myCsvFileName, myCsvFile in csvInZipFile:
if '2008-01' in myCsvFileName:
with open(os.path.join(dst,myCsvFileName),'wb') as outfile:
outfile.write(myCsvFile.read())
but I get the following error
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\'\Anaconda3\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 682, in runfile
execfile(filename, namespace)
File "C:\Users\'\Anaconda3\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 85, in execfile
exec(compile(open(filename, 'rb').read(), filename, 'exec'), namespace)
File "C:/Users//'/Documents/Example/unzipfiles.py", line 23, in <module>
outfile.write(myCsvFile.read())
File "C:\Users\'\Anaconda3\lib\site-packages\py7zlib.py", line 576, in read
data = getattr(self, decoder)(coder, data)
File "C:\Users\'\Anaconda3\lib\site-packages\py7zlib.py", line 634, in _read_lzma
return self._read_from_decompressor(coder, dec, input, checkremaining=True, with_cache=True)
File "C:\Users\'\Anaconda3\lib\site-packages\py7zlib.py", line 611, in _read_from_decompressor
tmp = decompressor.decompress(data)
ValueError: data error during decompression
The odd thing is that the method seems to work fine for the first two csv files. I have no idea how to get to the root of the problem. At least the data in the csv files do not seem to be different. Manually unpacking the different csv files using IZArc goes without problem. (The problem occurred in both python 2.7 and 3.4).
I have also tried to use the lzma module, but here I could not figure out how to retrieve the different csv files contained in the 7z file.

Categories

Resources