Python: search string in zipped files

Is there any way to search for a string in a file inside a zip file without unzipping it?
I have following structure of directories:
.
├───some_zip_file.zip
│ ├──some_directory.SAFE
│ │ ├──some_dir
│ │ ├──some_another_dir
│ │ ├──manifest.safe   # search in this file

The zipfile module can help you:
It lets you open a file inside the zip as a file-like object, or read a file from the archive directly.
Concretely, you can read and store the content of a file from the zip this way:
import zipfile
with zipfile.ZipFile("some_zip_file.zip", "r") as zip:
    with zip.open("some_directory.SAFE/manifest.safe") as manifest:
        content = manifest.read()
or:
import zipfile
with zipfile.ZipFile("some_zip_file.zip", "r") as zip:
    content = zip.read("some_directory.SAFE/manifest.safe")
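To actually search without extracting anything to disk, you can then test the bytes you just read. A minimal sketch, assuming the names from the question and a placeholder search string:
import zipfile

# Sketch: read the member into memory and search it directly;
# b"some string" is a placeholder for whatever you are looking for.
with zipfile.ZipFile("some_zip_file.zip", "r") as zf:
    content = zf.read("some_directory.SAFE/manifest.safe")
    if b"some string" in content:
        print("found it")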

Related

Using glob recursion to get sub directories and files containing CSVs

I am trying to concat multiple CSVs that live in subfolders of my parent directory.
/ParentDirectory
│
│
├───SubFolder 1
│ test1.csv
│
├───SubFolder 2
│ test2.csv
│
├───SubFolder 3
│ test3.csv
│ test4.csv
│
├───SubFolder 4
│ test5.csv
When I do
import pandas as pd
import glob
files = glob.glob('/ParentDirectory/*.csv', recursive=True)
df = pd.concat([pd.read_csv(fp) for fp in files], ignore_index=True)
I get ValueError: No objects to concatenate.
But if I select a specific subfolder, it works:
files = glob.glob('/ParentDirectory/SubFolder 3/*.csv', recursive=True)
How come glob isn't able to go down a directory and get the CSVs within each folder of the parent directory?
Try:
files = glob.glob('/ParentDirectory/**/*.csv', recursive=True)
or:
files = glob.glob('/ParentDirectory/*/*.csv')
The second pattern doesn't need recursive=True, but it does need a wildcard for the subdirectory level; the original pattern only matched CSVs sitting directly in /ParentDirectory.
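Putting it together with the concat from the question (a sketch, assuming the same directory layout):
import glob
import pandas as pd

# '**' with recursive=True descends any number of levels;
# '/ParentDirectory/*/*.csv' would also work for one level of subfolders.
files = glob.glob('/ParentDirectory/**/*.csv', recursive=True)
df = pd.concat([pd.read_csv(fp) for fp in files], ignore_index=True)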

How to efficiently convert multiple .gz files to one .tar.gz in Python?

I want to combine multiple .gz (not .tar.gz) files into one .tar.gz file in Python.
I created temporary files, unzipped the .gz files one by one, and used TarFile.addfile.
I was able to do what I wanted, but it feels inefficient. I want to use a buffer.
Is there a more efficient way? Thank you.
The current situation is like this:
import gzip
import os
import tarfile

def make_tmp(gz_file):
    # Decompress one .gz from target_gz/ into tmp/, 64 KiB at a time
    with gzip.open(os.path.join("target_gz", gz_file), "rb") as rt:
        with open(os.path.join("tmp", gz_file), mode="wb") as w:
            while True:
                buf = rt.read(65535)
                if not buf:
                    break
                w.write(buf)

gz_files = os.listdir("target_gz")
for gz in gz_files:
    make_tmp(gz)
with tarfile.open("combined.tar.gz", mode="w:gz") as tw:
    for tmp in os.listdir("tmp"):
        tw.add(os.path.join("tmp", tmp), arcname=tmp)
I hope it looks like this:
with tarfile.open("combined.tar.gz", mode="w:gz") as tw:
    for gz in os.listdir("target_gz"):
        with open(os.path.join("target_gz", gz), mode="rb") as r:
            while True:
                buf = r.read(65535)
                if not buf:
                    break
                tw.write(gz[:12], buf)  # pseudocode: I want to add a file, cut "target_gz" off, and store it (TarFile has no write())
Directory tree:
.
├── target_gz/
│ ├── foo.gz
│ └── bar.gz
├── tmp/
│ ├── foo.file
│ └── bar.file
├── run.py
└── combined.tar.gz
This should work:
import pathlib
import tarfile
with tarfile.open('combined.tar.gz', 'w') as tw:
    for filename in pathlib.Path('./target_gz').glob('*.gz'):
        print(filename)
        tw.add(filename)
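If the goal is to avoid temporary files entirely, each .gz member can be decompressed in memory and handed to TarFile.addfile(), which takes a TarInfo header plus a file-like object. A sketch under the question's directory layout (it assumes each decompressed file fits in memory, since the tar header needs the final size up front):
import gzip
import io
import os
import tarfile

with tarfile.open("combined.tar.gz", mode="w:gz") as tw:
    for name in os.listdir("target_gz"):
        if not name.endswith(".gz"):
            continue
        # Decompress the member fully so its final size is known
        with gzip.open(os.path.join("target_gz", name), "rb") as r:
            data = r.read()
        info = tarfile.TarInfo(name=name[:-3])  # strip the ".gz" suffix
        info.size = len(data)
        tw.addfile(info, io.BytesIO(data))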

Python importing csv files within subfolders

Is there a way of importing all the files within folder1? Each csv file is contained within a subfolder. Below is the file structure.
C:/downloads/folder1 > tree /F
C:.
│ tree
│
├───2020-06
│ test1.csv
│
├───2020-07
│ test2.csv
│
├───2020-08
│ test3.csv
│
├───2020-09
│ test4.csv
I'm aware of glob, shown below, for collecting all files within a folder. However, can this be used for subfolders?
import glob
import pandas as pd
# Get a list of all the csv files
csv_files = glob.____('*.csv')
# List comprehension that loads all of the files
dfs = [pd.read_csv(____) for ____ in ____]
# List comprehension that looks at the shape of all DataFrames
print(____)
Use the recursive keyword argument of the glob.glob() method:
glob.glob('**\\*.csv', recursive=True)
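For example, a sketch filling in the blanks with the folder from the question:
import glob
import pandas as pd

# Recursive glob walks 2020-06, 2020-07, ... automatically
csv_files = glob.glob('C:/downloads/folder1/**/*.csv', recursive=True)
dfs = [pd.read_csv(f) for f in csv_files]
print([df.shape for df in dfs])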
You can use os.walk to find all subfolders and get the required files. Here's a code sample:
import os
import pandas as pd

path = '<Insert Path>'
file_extension = '.csv'
csv_file_list = []

for root, dirs, files in os.walk(path):
    for name in files:
        if name.endswith(file_extension):
            file_path = os.path.join(root, name)
            csv_file_list.append(file_path)

dfs = [pd.read_csv(f) for f in csv_file_list]
I found this on Kite's website; check it out:
import glob

path = "./directory/src_folder"
text_files = glob.glob(path + "/**/*.txt", recursive=True)
print(text_files)
OUTPUT
['./directory/src_folder/src_file.txt', './directory/src_folder/subdirectory/subdirectory_file.txt']

Zipping files to the same folder level

This thread here advises using shutil to zip files:
import shutil
shutil.make_archive(output_filename, 'zip', dir_name)
This zips everything in dir_name and maintains the folder structure in it. Is it possible to use this same library to remove all sub-folders and just zip all files in dir_name into the same level? Or must I introduce a separate code chunk to first consolidate the files? For example, this is a hypothetical folder structure:
\dir_name
    \dir1
        \cat1
            file1.txt
            file2.txt
        \cat2
            file3.txt
    \dir2
        \cat3
            file4.txt
Output zip should just contain:
file1.txt
file2.txt
file3.txt
file4.txt
shutil.make_archive does not have a way to do what you want without copying files to another directory, which is inefficient. Instead, you can use a compression library directly, similar to the linked answer you provided. Note this doesn't handle name collisions (see the sketch after the output listing below)!
import zipfile
import os
with zipfile.ZipFile('output.zip', 'w', zipfile.ZIP_DEFLATED, compresslevel=9) as z:
    for path, dirs, files in os.walk('dir_name'):
        for file in files:
            full = os.path.join(path, file)
            z.write(full, file)  # write the file, but with just the file's name, not its full path

# print the files in the zipfile
with zipfile.ZipFile('output.zip') as z:
    for name in z.namelist():
        print(name)
Given:
dir_name
├───dir1
│ ├───cat1
│ │ file1.txt
│ │ file2.txt
│ │
│ └───cat2
│ file3.txt
│
└───dir2
└───cat3
file4.txt
Output:
file1.txt
file2.txt
file3.txt
file4.txt
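Since the answer above doesn't handle name collisions, here is one way to extend it: a sketch that disambiguates duplicate basenames with a numeric suffix (file1.txt, file1_1.txt, ...):
import os
import zipfile

seen = {}
with zipfile.ZipFile('output.zip', 'w', zipfile.ZIP_DEFLATED) as z:
    for path, dirs, files in os.walk('dir_name'):
        for file in files:
            name = file
            if file in seen:
                # Already wrote a member with this basename: append a counter
                stem, ext = os.path.splitext(file)
                name = f"{stem}_{seen[file]}{ext}"
            seen[file] = seen.get(file, 0) + 1
            z.write(os.path.join(path, file), name)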
import glob
import os
import shutil
import tempfile

# The root directory to search
path = r'dir_name/'

# List all *.txt files under the root directory
file_paths = [file_path
              for root_path, _, _ in os.walk(path)
              for file_path in glob.glob(os.path.join(root_path, '*.txt'))]

# Create a temporary directory to copy your files into
with tempfile.TemporaryDirectory() as tmp:
    for file_path in file_paths:
        # Get the basename of the file
        basename = os.path.basename(file_path)
        # Copy the file to the temporary directory
        shutil.copyfile(file_path, os.path.join(tmp, basename))
    # Zip the temporary directory to the working directory
    shutil.make_archive('output', 'zip', tmp)
This will create an output.zip file in the current working directory. The temporary directory is deleted when the context manager exits.

How to get filepath directory and use it to read my excel file? (Mac)

I'm creating a basketball data visualization app, and I've already completed the GUI; now I'm just trying to import my database, which is an Excel file. I'm using pandas, and when I run this code, I get a "No such file or directory" error. I understand I must get the filepath, but how do I do this (Mac OS X) and use it to point my code at my file?
I tried directly copying and pasting the filepath with path = r'C:(insert path here)'
#Basketball DataVis (Data Visualization)
#pylint:disable = W0614
#By Robert Smith
#Import
import tkinter
import os
import pandas as pd
from tkinter import *
from PIL import Image, ImageTk
from pandas import *
#Import the excel file to use as a database
data = pd.read_excel("nbadata.xlsx", sheetname= "Sheet1")
The easiest way is to open a Terminal instance and drag the file into the terminal window; this prints the path, which you can then use in your script.
Note that Mac filepaths don't begin with C:
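A common cause of this error is that the script's working directory isn't the folder containing the file. A sketch that resolves the path relative to the script itself (it assumes nbadata.xlsx sits next to the script; note that recent pandas spells the argument sheet_name):
from pathlib import Path
import pandas as pd

# Build the path from the script's own location, not the working directory
here = Path(__file__).resolve().parent
data = pd.read_excel(here / "nbadata.xlsx", sheet_name="Sheet1")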
I suggest a recursive approach if you don't know where your xlsx file is (so you can't provide a relative or absolute path) but you do know its exact name and the root directory under which it exists.
For this kind of scenario, just pass the root path and filename to the recursive function and it will return a list of absolute paths of all matched file names.
Finally, you can choose the first one from that list if you are sure there are no other files with the same name, or print the list on the console and retry.
This method worked best in my case; a simple example follows.
Directory structure:
H:\RishikeshAgrawani\Projects\GenWork\Python3\try\test>tree . /f
Folder PATH listing for volume New Volume
Volume serial number is C867-828E
H:\RISHIKESHAGRAWANI\PROJECTS\GENWORK\PYTHON3\TRY\TEST
│ tree
│
├───c
│ docs.txt
│
├───cpp
│ docs.md
│
├───data
│ nbadata.xlsx
│
├───js
│ docs.js
│
├───matlab
│ docs.txt
│
├───py
│ │ docs.py
│ │
│ └───docs
│ docs.txt
│
└───r
docs.md
Here is the recursive implementation; please have a look and try it.
import os
import pandas as pd

def search_file_and_get_abspaths(path, filename):
    """
    Description
    ===========
    - Gives a list of absolute paths of matched file names by performing a recursive search
    - [] will be returned if there is no such file under the given path
    """
    matched_paths = []
    if os.path.isdir(path):
        files = os.listdir(path)
        for file in files:
            fullpath = os.path.join(path, file)
            if os.path.isdir(fullpath):
                # Recursive search in child directories
                matched_paths += search_file_and_get_abspaths(fullpath, filename)
            elif os.path.isfile(fullpath):
                if fullpath.endswith(filename):
                    if fullpath not in matched_paths:
                        matched_paths.append(fullpath)
    return matched_paths

if __name__ == "__main__":
    # Test case 1 (multiple files example)
    matched_paths = search_file_and_get_abspaths(r'H:\RishikeshAgrawani\Projects\GenWork\Python3\try\test', 'docs.txt')
    print(matched_paths)
    # ['H:\\RishikeshAgrawani\\Projects\\GenWork\\Python3\\try\\test\\c\\docs.txt', 'H:\\RishikeshAgrawani\\Projects\\GenWork\\Python3\\try\\test\\matlab\\docs.txt', 'H:\\RishikeshAgrawani\\Projects\\GenWork\\Python3\\try\\test\\py\\docs\\docs.txt']

    # Test case 2 (single file example)
    matched_paths2 = search_file_and_get_abspaths(r'H:\RishikeshAgrawani\Projects\GenWork\Python3\try\test', 'nbadata.xlsx')
    print(matched_paths2)

    if matched_paths2:
        xlsx_path = matched_paths2[0]  # If your file name is unique there will only be 1
        print(xlsx_path)  # H:\RishikeshAgrawani\Projects\GenWork\Python3\try\test\data\nbadata.xlsx
        data = pd.read_excel(xlsx_path, sheet_name="Sheet1")
    else:
        print("Path does not exist")
