How to extract a multi-part zip file in python? - python

Suppose that I have some files that I downloaded from a server and they are zipped with 7zip in multiple parts; the format is something like this: myfile.zip.001, myfile.zip.002, ..., myfile.zip.00n. Basically, I need to extract their content into the same folder where they are stored.
I tried using zipfile, patoolib and pyunpack without success, here is what I've done:
# Three separate attempts at extracting the first volume; each one was run on
# its own and fails with the error quoted in the trailing comment.
file_path = r"C:\Users\user\Documents\myfile.zip.001"  # I also tested with only .zip
# A raw string literal cannot end in a single backslash, so the trailing
# separator is dropped; the path still names the same folder.
extract_path = r"C:\Users\user\Documents"

# Attempt 1: standard-library zipfile
import zipfile
with zipfile.ZipFile(file_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)  # myfile.zip.001 file isn't zip file.

# Attempt 2: pyunpack
from pyunpack import Archive
Archive(file_path).extractall(extract_path)  # File is not a zip file

# Attempt 3: patool
import patoolib
patoolib.extract_archive(file_path, outdir=extract_path)  # unknown archive format for file `myfile.zip.001'
Another way (that works, but it's very ugly) is this one:
import os
import subprocess

# Extract the multi-part archive by shelling out to an installed 7-Zip.
path_7zip = r"C:\Program Files (x86)\7-Zip\7z.exe"
cmd = [path_7zip, 'x', 'myfile.zip.001']
# run() waits for 7-Zip to finish and drains the output pipe, so the child
# cannot stall on a full pipe; check=True raises CalledProcessError when
# 7-Zip reports a failure instead of ignoring it.
sp = subprocess.run(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE, check=True)
But this requires the user to have 7-Zip installed on their computer, which isn't the kind of solution I'm looking for.
So, the question is: is there a way to extract/unzip multi-part files with the format x.zip.001 in Python?

You seem to be on the right track with zipfile, but you most likely have to concatenate the zip file before using extractall.
import glob
import os
import shutil
import zipfile

zip_prefix = "myfile.zip."
# The parts are re-joined into this single archive, which is then extracted.
joined_zip = "myfile.zip"

# N number of parts: count only the numbered volumes (.001, .002, ...) so an
# unrelated file such as "myfile.zip.bak" does not inflate the count.
parts = glob.glob(zip_prefix + "[0-9][0-9][0-9]")
n = len(parts)

# Concatenate the parts in numeric order. Building each name from the index
# makes a missing volume fail loudly with FileNotFoundError.
with open(joined_zip, "wb") as outfile:
    for i in range(1, n + 1):
        filename = zip_prefix + str(i).zfill(3)
        with open(filename, "rb") as infile:
            # Stream in chunks instead of loading a whole part into memory.
            shutil.copyfileobj(infile, outfile)

# Extract the concatenated archive (not the .001 part, which is not a valid
# zip file on its own). `extract_path` is the destination folder defined in
# the question's snippet.
with zipfile.ZipFile(joined_zip, "r") as zip_ref:
    zip_ref.extractall(extract_path)

Related

Get only the txt file you want from the folder containing the txt file - Python

I have a folder with .txt files. The names of the files are:
my_file1.txt
my_file2.txt
my_file3.txt
my_file4.txt
In this way, only the last number is different.
import pickle

# Load three pickled objects, one per file, into a single list.
my_list = []
# The three open() calls belong to one `with` statement; the explicit line
# continuations keep it valid, and each path is a single unbroken literal.
with open("/Users/users_a/Desktop/website-basic/sub_domain/sub_domain01.txt", "rb") as f1, \
        open("/Users/users_a/Desktop/website-basic/sub_domain/sub_domain02.txt", "rb") as f2, \
        open("/Users/users_a/Desktop/website-basic/sub_domain/sub_domain03.txt", "rb") as f3:
    my_list.append(pickle.load(f1))
    my_list.append(pickle.load(f2))
    my_list.append(pickle.load(f3))
print(my_list)
In this way, I load a file and put it in the my_list variable to make a list and work. As the number of files to work increases, the code becomes too long and cumbersome.
Is there an easier and more pythonic way to load only the desired txt file??
You can use os.listdir():
import os
import pickle

# Unpickle every .txt file found directly inside `path`.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
my_list = []
path = "/Users/users_a/Desktop/website-basic/sub_domain"
for file in os.listdir(path):
    if file.endswith(".txt"):
        # Pickle data is binary, so the file must be opened with "rb";
        # text mode makes pickle.load fail on Python 3.
        with open(os.path.join(path, file), "rb") as f:
            my_list.append(pickle.load(f))
Where file is the filename of a file in path
I suggest using os.path.join() instead of hard coding the file paths
If your folder only contains the files you want to load you can just use:
# Unpickle every entry of `path`; `path` and `my_list` come from the snippet
# above. Pickle data is binary, hence mode "rb".
for file in os.listdir(path):
    with open(os.path.join(path, file), "rb") as f:
        my_list.append(pickle.load(f))
Edit for my_file[number].txt
If you only want files in the form of my_file[number].txt, use:
import os
import re
import pickle

# Unpickle only the files named my_file<number>.txt inside `path`.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
my_list = []
path = "/Users/users_a/Desktop/website-basic/sub_domain"
# The dot is escaped and the whole name must match, so names such as
# "my_file1xtxt" or "my_file1.txt.bak" are rejected.
name_pattern = re.compile(r"my_file\d+\.txt")
for file in os.listdir(path):
    if name_pattern.fullmatch(file):
        # Pickle data is binary, so the file must be opened with "rb".
        with open(os.path.join(path, file), "rb") as f:
            my_list.append(pickle.load(f))
Online regex demo https://regex101.com/r/XJb2DF/1

Merging specific files of a directory in python

I have a directory C:/newdir/ It contains the following files with filenames :
s1_student1_file
t1_teacher2_file
hab_parent5_file
y1_professor_file
bsa_assistant2_file
t1_student_file
nas_officer_file
ee1_newguy15_file
ee1_professor15_file
f1_student8_file
I want to merge the content of s1_student1_file, t1_teacher2_file, t1_student_file, y1_professor_file, ee1_newguy15_file, f1_student8_file, and ee1_professor15_file into a new file called all_file, and then delete the merged files from the directory. I have to write Python code for this but cannot figure out how.
You can use glob to list all the chosen text files in a chosen folder. Then, you can use a for-loop to loop over all the text file and write the content into another file:
from glob import glob

# Append the content of every file matching the chosen prefixes to
# all_file.txt. 'f1*' is included because f1_student8_file is one of the
# files that must be merged.
with open('all_file.txt', 'a') as f:
    for file in glob('s1*') + glob('t1*') + glob('y1*') + glob('ee1*') + glob('f1*'):
        # glob already returns the full file name; appending 'txt' to it
        # would point at a file that does not exist.
        with open(file, 'r') as r:
            f.write(r.read())
To remove the files afterwards:
from glob import glob
import os

# Append every matching file to all_file.txt, then delete the source file.
# 'f1*' is included because f1_student8_file is one of the files to merge.
with open('all_file.txt', 'a') as f:
    for file in glob('s1*') + glob('t1*') + glob('y1*') + glob('ee1*') + glob('f1*'):
        with open(file, 'r') as r:
            f.write(r.read())
        # Remove only after the source is closed: deleting a file that is
        # still open fails on Windows.
        os.remove(file)
Import shutil & pathlib libraries
Both shutil and pathlib are part of the Python standard library (pathlib since Python 3.4), so there is nothing to install with pip; they can simply be imported.
Python Implementation
import shutil
from pathlib import Path

# Merge two fixed input files into a user-named output file, then optionally
# print the merged content.
firstfile = Path(r'C:\Users\Sohom\Desktop\A.txt')
secondfile = Path(r'C:\Users\Sohom\Desktop\B.txt')

newfile = input("Enter the name of the new file: ")
print()
print("The merged content of the 2 files will be in", newfile)

# Copy both inputs byte-for-byte, 10 MiB at a time.
with open(newfile, "wb") as wfd:
    for f in [firstfile, secondfile]:
        with open(f, "rb") as fd:
            shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)

print("\nThe content is merged successfully.!")
print("Do you want to view it ? (y / n): ")
check = input()
if check == 'n':
    # Same effect as exit(), without relying on the helper that the `site`
    # module injects (absent under `python -S` or in frozen apps).
    raise SystemExit
else:
    print()
    # The context manager closes the file even if reading or printing fails.
    with open(newfile, "r") as c:
        print(c.read())
......
Create Array of text files dynamically

find and replace string from multiple files in a folder using python

I want to find string e.g. "Version1" from my files of a folder which contains multiple ".c" and ".h" files in it and replace it with "Version2.2.1" using python file.
Anyone know how this can be done?
Here's a solution using os, glob and ntpath. The results are saved in a directory called "output". You need to put this in the directory where you have the .c and .h files and run it.
Create a separate directory called output and put the edited files there:
import glob
import ntpath
import os

# Write an edited copy of every .c/.h file in the current directory into
# "output", leaving the originals untouched.
output_dir = "output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for source_path in glob.glob("*.[ch]"):
    target_path = '%s/%s' % (output_dir, ntpath.basename(source_path))
    with open(source_path, 'r') as source, open(target_path, 'w') as target:
        # Stream line by line so large files are never fully held in memory.
        for text_line in source:
            target.write(text_line.replace('Version1', 'Version2.2.1'))
Replace strings in place:
IMPORTANT! Please make sure to back up your files before running this:
import glob

# Rewrite every .c/.h file in the current directory in place.
for source_path in glob.glob("*.[ch]"):
    # Read the whole file first; it is reopened for writing (and truncated)
    # only once the reader has been closed.
    with open(source_path, "r") as reader:
        original_text = reader.read()
    updated_text = original_text.replace('Version1', 'Version2.2.1')
    with open(source_path, "w") as writer:
        writer.write(updated_text)

Read and write multiple files in python? [duplicate]

I want to write a program for this: In a folder I have n number of files; first read one file and perform some operation then store result in a separate file. Then read 2nd file, perform operation again and save result in new 2nd file. Do the same procedure for n number of files. The program reads all files one by one and stores results of each file separately. Please give examples how I can do it.
I think what you miss is how to retrieve all the files in that directory.
To do so, use the glob module.
Here is an example which will duplicate all the files with extension *.txt to files with extension *.out
import glob
import os

# Copy every *.txt file in the current directory to a sibling *.out file.
list_of_files = glob.glob('./*.txt')  # create the list of file
for file_name in list_of_files:
    # Swap only the extension: str.replace('txt', 'out') would also rewrite
    # any other "txt" in the path (e.g. "./txt_notes.txt").
    out_name = os.path.splitext(file_name)[0] + '.out'
    # The context manager closes both files even if the copy fails.
    with open(file_name, 'r') as FI, open(out_name, 'w') as FO:
        for line in FI:
            FO.write(line)
import sys

# argv is your commandline arguments, argv[0] is your program name, so skip it
for n in sys.argv[1:]:
    print(n)  # print out the filename we are currently processing
    # Names that do not shadow the builtin input(); the context manager
    # closes both files even if the processing step raises.
    with open(n, "r") as infile, open(n + ".out", "w") as outfile:
        # do some processing
        pass
Then call it like:
./foo.py bar.txt baz.txt
You may find the fileinput module useful. It is designed for exactly this problem.
I've just learned of the os.walk() command recently, and it may help you here.
It allows you to walk down a directory tree structure.
import os

# Walk the tree below the current directory and, for each file, open it for
# reading together with a same-named result file inside OUTPUT_DIR.
OUTPUT_DIR = 'C:\\RESULTS'
for path, dirs, files in os.walk('.'):
    for file in files:
        # os.path.join (there is no os.join) builds both paths; the result
        # file needs mode 'w', since open() defaults to read-only.
        with open(os.path.join(path, file), 'r') as read_f, \
                open(os.path.join(OUTPUT_DIR, file), 'w') as write_f:
            # Do stuff
            pass
Combined answer incorporating directory or specific list of filenames arguments:
import sys
import os.path
import glob
def processFile(filename):
    """Read *filename* line by line; the per-line processing is a placeholder.

    Raises OSError if the file cannot be opened.
    """
    # The context manager closes the file even if processing a line raises.
    with open(filename, "r") as fileHandle:
        for line in fileHandle:
            # do some processing
            pass
def outputResults(filename):
    """Write the results for *filename* to "<filename>.out".

    The output currently consists of the single line "processed".
    Raises OSError if the output file cannot be created.
    """
    output_filemask = "out"
    # The context manager flushes and closes the file even if a write fails.
    with open("%s.%s" % (filename, output_filemask), "w") as fileHandle:
        # do some processing
        fileHandle.write('processed\n')
def processFiles(args):
    """Process a directory of *.log files or an explicit list of files.

    args is an argv-style list: args[0] is the program name. If args[1] is a
    directory, every "*.log" file inside it is processed; otherwise every
    element of args[1:] is treated as a file name. Each file is passed to
    processFile() and then to outputResults().
    """
    input_filemask = "log"
    directory = args[1]
    if os.path.isdir(directory):
        # print(...) with one argument behaves the same on Python 2 and 3.
        print("processing a directory")
        list_of_files = glob.glob('%s/*.%s' % (directory, input_filemask))
    else:
        print("processing a list of files")
        # Use the argument that was passed in rather than reaching back to
        # sys.argv, so the function also works when called with another list.
        list_of_files = args[1:]
    for file_name in list_of_files:
        print(file_name)
        processFile(file_name)
        outputResults(file_name)
# Script entry point: process the command-line arguments, or print a usage
# hint when none were given.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        processFiles(sys.argv)
    else:
        # print(...) with one argument behaves the same on Python 2 and 3.
        print('usage message')
# NOTE(review): Python 2 script (print statements, `print >>f`). The original
# indentation was lost in extraction, so the loop nesting cannot be confirmed
# from this text; the comments below describe each statement on its own.
from pylab import *
import csv
import os
import glob
import re
# x collects the field at index 1 of every record; y receives a slice of x.
x=[]
y=[]
# Output file that receives the per-CSV result lines via `print >>f` below.
f=open("one.txt",'w')
# Visit every file matching *.csv relative to the current working directory.
for infile in glob.glob(('*.csv')):
# print "" +infile
# csv2rec is not defined in this snippet; presumably it comes from the pylab
# star import -- TODO confirm.
csv23=csv2rec(""+infile,'rb',delimiter=',')
for line in csv23:
x.append(line[1])
# print len(x)
# Copy the entries of x at indices 3000..7999 into y.
for i in range(3000,8000):
y.append(x[i])
# Report the file name and mean(y) on stdout and in one.txt; `mean` is
# presumably also provided by the pylab star import -- TODO confirm.
print ""+infile,"\t",mean(y)
print >>f,""+infile,"\t\t",mean(y)
# Empty both lists in place.
del y[:len(y)]
del x[:len(x)]
I know I saw this double with open() somewhere but couldn't remember where. So I built a small example in case someone needs.
""" A module to clean code(js, py, json or whatever) files saved as .txt files to
be used in HTML code blocks. """
from os import listdir
from os.path import abspath, dirname, splitext
from re import sub, MULTILINE
def cleanForHTML(directory=None):
    """ Write an HTML-escaped "(fixed)" copy of every .txt file in a directory.

    directory -- folder to scan; defaults to the folder containing this
                 python file.

    For each "name.txt" a sibling "name(fixed).txt" is written in which
    &, < and > are replaced by &amp;, &lt; and &gt;. Files whose name already
    ends in "(fixed)" are skipped so repeated runs do not escape them again.
    """
    from os.path import join

    ## Search-and-replace table. '&' must come first: otherwise the '&' of a
    ## freshly inserted &lt; or &gt; would itself be escaped to &amp;lt;.
    ## We might want to replace whitespace chars as well, e.g. ('\t', ' ')
    ## and ('\n', '<br>').
    search_ = (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'))
    if directory is None:
        ## Our default location is the same one that our python file is in.
        directory = abspath(dirname(__file__))
    for loc in listdir(directory):
        ## Here we split our filename into its parts ('fileName', '.txt')
        name = splitext(loc)
        if name[1] != '.txt' or name[0].endswith('(fixed)'):
            continue
        ## listdir returns bare names, so join them with the directory;
        ## otherwise open() would look in the current working directory.
        source_path = join(directory, loc)
        target_path = join(directory, f'{name[0]}(fixed){name[1]}')
        with open(source_path, 'r') as file_1, open(target_path, 'w') as file_2:
            ## read our first file
            retFile = file_1.read()
            ## find and replace some text.
            for find_, replace_ in search_:
                retFile = retFile.replace(find_, replace_)
            ## finally we can write to our newly created text file.
            file_2.write(retFile)
This approach also works for reading multiple files. My files are named federalist_1.txt, federalist_2.txt, and so on, up to federalist_84.txt.
I'm reading each file as f.
# Read federalist_1.txt .. federalist_84.txt and keep each file's text;
# a bare f.read() would discard the content immediately.
file_numbers = range(1, 85)
texts = []
for number in file_numbers:
    with open(f'federalist_{number}.txt', 'r') as f:
        texts.append(f.read())

Compress all files in a folder with python?

this code takes a bunch of files in a folder (based on the file name), zips them into bz2 and adds them into a tar file. Is there a way I can modify this to only compress the files into bz2 (or gzip)? I do not want to have to deal with having them packaged into a tar. I just want to go through each file in a directory and compress it.
import os
from glob import glob
import tarfile

# Package every survey report CSV in the FTP folder into one .tar.bz2 file.
os.chdir(r'C:\Documents\FTP\\')
compression = "w:bz2"
extension = '.tar.bz2'
filename = 'survey_'
filetype = 'survey_report_*.csv'
# NOTE(review): `saveloc` is not defined anywhere in this snippet; it must be
# set to the destination folder before this line runs.
tarname = saveloc + filename + extension
files = glob(filetype)
# The context manager finalizes and closes the archive even if adding a file
# fails, instead of leaving a truncated tar behind.
with tarfile.open(tarname, compression) as tar:
    for file in files:
        if file not in tarname:
            print('Packaging file:', file)
            tar.add(file)
EDIT:
This code seems to work for some files, but for other ones it makes them 1kb and when I open it there are just some random characters. Any suggestions?
import bz2
import os
import shutil

# Compress every regular file in the folder into its own "<name>.bz2".
location = r'C:\Users\Documents\FTP\\'
os.chdir(location)
filelist = os.listdir(location)
for file in filelist:
    # Skip sub-directories and archives produced by an earlier run.
    if not os.path.isfile(file) or file.endswith('.bz2'):
        continue
    # Read in binary mode: a text-mode read on Windows stops at the first
    # Ctrl-Z byte and translates line endings, which produces the tiny,
    # garbled archives described above. The context managers also close both
    # files if anything fails.
    with open(file, 'rb') as source, bz2.BZ2File(file + '.bz2', 'wb') as output:
        shutil.copyfileobj(source, output)

Categories

Resources