How to unpack a zip file without changing metadata? [duplicate] - python

I'm trying to extract files from a zip file using Python 2.7.1 (on Windows, fyi) and each of my attempts shows extracted files with Modified Date = time of extraction (which is incorrect).
import os,zipfile
outDirectory = 'C:\\_TEMP\\'
inFile = 'test.zip'
# Let ZipFile open the archive itself and close it through the with-block,
# so the handle is released even if an extraction raises.
with zipfile.ZipFile(os.path.join(outDirectory, inFile)) as z:
    for name in z.namelist():
        # NOTE: extract() does not restore the archived modification time;
        # the extracted files end up stamped with the time of extraction.
        z.extract(name, outDirectory)
I also tried using the .extractall method, with the same results.
import os,zipfile
outDirectory = 'C:\\_TEMP\\'
inFile = 'test.zip'
zFile = zipfile.ZipFile(os.path.join(outDirectory,inFile))
zFile.extractall(outDirectory)
Can anyone tell me what I'm doing wrong?
I'd like to think this is possible without having to post-correct the modified time per How do I change the file creation date of a Windows file?.

Well, it does take a little post-processing, but it's not that bad:
import os
import zipfile
import time
outDirectory = 'C:\\TEMP\\'
inFile = 'test.zip'
# The with-block closes the archive even when an extraction fails.
with zipfile.ZipFile(os.path.join(outDirectory, inFile)) as z:
    for f in z.infolist():
        # extract() creates missing parent directories, copes with directory
        # entries and streams the data; it returns the path actually written,
        # which can differ from the raw member name after sanitising.
        name = z.extract(f, outDirectory)
        # date_time is a 6-tuple (Y, M, D, h, m, s); pad it to the 9 fields
        # mktime() expects. The trailing -1 lets mktime() work out DST.
        date_time = time.mktime(f.date_time + (0, 0, -1))
        # A directory's timestamp may be bumped again when later members are
        # extracted into it.
        os.utime(name, (date_time, date_time))
Okay, maybe it is that bad.

Based on Jia103's answer, I have developed a function (using Python 2.7.14) which preserves directory and file dates AFTER everything has been extracted. This isolates any ugliness in the function, and you can also use zipfile.ZipFile.extractall() or whatever zip extract method you want:
import time
import zipfile
import os
# Restores the timestamps of zipfile contents.
def RestoreTimestampsOfZipContents(zipname, extract_dir):
    """Stamp already-extracted archive members with their archived times.

    Call this after the archive has been extracted into ``extract_dir``.

    Args:
        zipname: Path of the zip archive that was extracted.
        extract_dir: Directory the archive was extracted into.

    Raises:
        OSError: If a member listed in the archive is missing on disk.
    """
    # Only the member list is needed, so close the archive right away
    # instead of leaking the handle.
    with zipfile.ZipFile(zipname, 'r') as archive:
        entries = archive.infolist()
    for f in entries:
        # path to this extracted f-item
        fullpath = os.path.join(extract_dir, f.filename)
        # Pad the 6-field date_time to the 9 fields mktime() expects;
        # the trailing -1 lets mktime() determine DST itself.
        date_time = time.mktime(f.date_time + (0, 0, -1))
        # Set both access and modification time to the archived value.
        os.utime(fullpath, (date_time, date_time))
To preserve dates, just call this function after your extract is done.
Here's an example, from a script I wrote to zip/unzip game save directories:
# Extract a save-game archive, then restore the archived timestamps.
with zipfile.ZipFile(zipname, 'r') as z:
    print('I have opened zipfile %s, ready to extract into %s'
          % (zipname, gamedir))
    try:
        os.makedirs(gamedir)
    except OSError:
        # Most of the time the dir already exists; anything else (for
        # example a permission problem) is a real error and must surface.
        if not os.path.isdir(gamedir):
            raise
    z.extractall(gamedir)
RestoreTimestampsOfZipContents(zipname, gamedir)  #<-- USED
print('%s zip extract done' % GameName[game])
Thanks everyone for your previous answers!

Based on Ethan Fuman's answer, I have developed this version (using Python 2.6.6) which is a little more concise:
zf = ZipFile('archive.zip', 'r')
# try/finally rather than a with-block: ZipFile only became a context
# manager in Python 2.7, and this snippet targets 2.6.
try:
    for zi in zf.infolist():
        # extract() returns the path it actually wrote, which may differ from
        # zi.filename once the member name has been sanitised.
        extracted = zf.extract(zi)
        # Pad date_time to 9 fields; -1 lets mktime() determine DST.
        date_time = time.mktime(zi.date_time + (0, 0, -1))
        os.utime(extracted, (date_time, date_time))
finally:
    zf.close()
This extracts to the current working directory and uses the ZipFile.extract() method to write the data instead of creating the file itself.

Based on Ber's answer, I have developed this version (using Python 2.7.11), which also accounts for directory mod dates.
from os import path, utime
from sys import exit
from time import mktime
from zipfile import ZipFile
def unzip(zipfile, outDirectory):
    """Extract an archive and restore file and directory modification times.

    Args:
        zipfile: Path of the zip archive to extract.
        outDirectory: Directory to extract into.
    """
    dirs = {}
    with ZipFile(zipfile, 'r') as z:
        for f in z.infolist():
            # Use the path extract() reports rather than re-joining the raw
            # member name: extract() sanitises names (e.g. strips a leading
            # slash), so the two can differ.
            name = z.extract(f, outDirectory)
            # still need to adjust the dt o/w item will have the current dt
            date_time = mktime(f.date_time + (0, 0, -1))
            if path.isdir(name):
                # changes to dir dt will have no effect right now since files
                # are being created inside of it; hold the dt and apply it later
                dirs[name] = date_time
            else:
                utime(name, (date_time, date_time))
    # done creating files, now update dir dt
    for name, date_time in dirs.items():
        utime(name, (date_time, date_time))
if __name__ == "__main__":
unzip('archive.zip', 'out')
exit(0)
Since directories are being modified as the extracted files are being created inside them, there appears to be no point in setting their dates with os.utime until after the extraction has completed, so this version caches the directory names and their timestamps till the very end.

Related

Incrementing a file name in python

I am writing code which generates a new text file named with today's date each time it is run. For example, today's file name would be 2020-10-05. I would like to increment it so that if the program is run more than once on the same day, the name becomes 2020-10-05_1, _2, etc.
I have this code that I found from another question and i've tried tinkering with it but I'm still stuck. The problem is here they convert the file name to an int 1,2,3 and this way it works but this isn't the result I want.
def incrementfile():
todayday = datetime.datetime.today().date()
output_folder = "//10.2.30.61/c$/Qlikview_Tropal/Raport/"
highest_num = 0
for f in os.listdir(output_folder):
if os.path.isfile(os.path.join(output_folder, f)):
file_name = os.path.splitext(f)[0]
try:
file_num = int(file_name)
if file_num > highest_num:
highest_num = file_num
except ValueError:
print("The file name %s is not an integer. Skipping" % file_name)
output_file = os.path.join(output_folder, str(highest_num + 1) + f"{todayday}" + ".txt")
return output_file
How can I modify this code so that the output I get in the end is something like 2020-10-05_0, _1, _2 etc.. ?
Thanks !
I strongly recommend you to use pathlib instead of os.path.join. This is more convenient.
def incrementfile(output_folder="/tmp"):
    """Return an unused ``<today>_<n>.txt`` path inside ``output_folder``.

    Args:
        output_folder: Directory to search; defaults to ``/tmp``.

    Returns:
        A ``pathlib.Path`` that does not exist yet. The file is not created.
    """
    td = datetime.datetime.today().date()
    path = pathlib.Path(output_folder)
    # Start from the number of files already carrying today's date ...
    inc = len(list(path.glob(f"{td}*"))) + 1
    outfile = path / f"{td}_{inc}.txt"
    # ... but a bare count can land on an existing name (e.g. after an
    # earlier file was deleted), so keep bumping until the name is free.
    while outfile.exists():
        inc += 1
        outfile = path / f"{td}_{inc}.txt"
    return outfile
Not a direct answer to your question, but instead of using _1, _2 etc, you could use a full timestamp with date and current time, which would avoid duplication, EG:
from datetime import datetime
t = str(datetime.now()).replace(":", "-").replace(" ", "_")
print(t)
Example output:
2020-10-05_13-06-53.825870
I think this will work-
import os
import datetime
#assuming files will be .txt format
def incrementfile(output_folder="//10.2.30.61/c$/Qlikview_Tropal/Raport/"):
    """Return the first unused ``YYYY-MM-DD_<n>.txt`` name in ``output_folder``.

    Numbering starts at 0, giving ``2020-10-05_0.txt``, ``_1``, ``_2`` and so on.

    Args:
        output_folder: Directory whose existing file names are checked.

    Returns:
        The file name only (not joined with the folder). Nothing is created.
    """
    # A set makes each membership test O(1).
    existing = set(os.listdir(output_folder))
    today = datetime.date.today().strftime('%Y-%m-%d')
    current_num = 0
    # Rebuild the candidate from the date each time, so suffixes do not
    # pile up as "_0_1_2".
    while '%s_%d.txt' % (today, current_num) in existing:
        current_num += 1
    return '%s_%d.txt' % (today, current_num)

urllib urlretrieve only saving final image in list of urls

I'm fairly new to using Python. I have been trying to set up a very basic web scraper to help speed up my workday, it is supposed to download images from a section of a website and save them.
I have a list of urls and I am trying to use urllib.request.urlretrieve to download all the images.
The output location (savepath) updates so it adds 1 to the current highest number in the folder.
I've tried a bunch of different ways but urlretrieve only saves the image from the last url in the list. Is there a way to download all the images in the url list?
to_download=['url1','url2','url3','url4']
for t in to_download:
urllib.request.urlretrieve(t, savepath)
This is the code I was trying to use to update the savepath every time
def getNextFilePath(photos):
    """Return the path in ``photos`` named one above the highest numeric file.

    Only regular files whose name (without extension) parses as an integer
    are considered. The returned path has no extension.

    Args:
        photos: Directory to scan.

    Returns:
        ``os.path.join(photos, str(highest + 1))``; ``highest`` is 0 when no
        numerically named file exists.
    """
    highest_num = 0
    for f in os.listdir(photos):
        if not os.path.isfile(os.path.join(photos, f)):
            continue
        file_name = os.path.splitext(f)[0]
        try:
            file_num = int(file_name)
        except ValueError:
            # The original had this message as a bare string expression,
            # which printed nothing.
            print('The file name "%s" is not an integer. Skipping' % file_name)
            continue
        if file_num > highest_num:
            highest_num = file_num
    # The original joined onto an undefined name (output_folder); the new
    # file belongs in the folder that was scanned.
    output_file = os.path.join(photos, str(highest_num + 1))
    return output_file
As suggested by @vks, you need to update savepath (otherwise you save each URL to the same file). One way to do so is to use enumerate:
from urllib import request
to_download=['https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/']
for i, url in enumerate(to_download):
save_path = f'website_{i}.txt'
print(save_path)
request.urlretrieve(url, save_path)
which you may want to contract into:
from urllib import request
to_download=['https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/','https://edition.cnn.com/']
[request.urlretrieve(url, f'website_{i}.txt') for i, url in enumerate(to_download)]
see:
Python3 doc: Python enumerate doc
Example of enumerate: enumerate example
Example of f' using a string with a {variable}': f string example
FOR SECOND PART OF THE QUESTION:
Not sure what you are trying to achieve but:
def getNextFilePath(photos):
    """Return a path in ``photos`` named one above the highest all-digit entry.

    Only directory entries whose whole name consists of digits count, so
    names with an extension are ignored.

    Args:
        photos: Directory to scan.

    Returns:
        ``os.path.join(photos, str(max_id + 1))``; ``max_id`` is 0 when the
        directory holds no all-digit names.
    """
    file_list = os.listdir(photos)
    file_list = [int(s) for s in file_list if s.isdigit()]
    print(file_list)
    # default=0 keeps an empty folder from raising ValueError.
    max_id_file = max(file_list, default=0)
    print(f'max id:{max_id_file}')
    # The original referenced an undefined output_folder here.
    output_file = os.path.join(photos, str(max_id_file + 1))
    print(f'output file path:{output_file}')
    return output_file
this will hopefully find all files that are named with digits (IDs), and find the highest ID, and return a new file name as a max_id+1
I guess that this will replace the save_path in your example.
Quickly coding this up, and modifying the function above so that it returns the max_id rather than the path,
the code below should be a working example using the iterator:
import os
from urllib import request
photo_folder = os.path.curdir


def getNextFilePath(photos):
    """Return the highest numeric file stem found in ``photos``.

    Args:
        photos: Directory to scan.

    Returns:
        The largest integer among names whose stem (name without extension)
        is all digits, or 0 when there is none.
    """
    file_list = os.listdir(photos)
    print(file_list)
    stems = (os.path.splitext(s)[0] for s in file_list)
    file_list = [int(stem) for stem in stems if stem.isdigit()]
    if not file_list:
        return 0
    print(file_list)
    return max(file_list)


def download_pic(to_download, folder=None):
    """Download each URL to the next free ``<n>.png`` in the photo folder.

    Args:
        to_download: Iterable of URLs to fetch.
        folder: Target directory; defaults to the module-level
            ``photo_folder``.
    """
    if folder is None:
        folder = photo_folder
    # Begin one above the highest existing id. Starting at the id itself
    # would overwrite the file that already carries it.
    start_id = getNextFilePath(folder) + 1
    for i, url in enumerate(to_download, start=start_id):
        output_file = os.path.join(folder, f'{i}.png')
        print(output_file)
        request.urlretrieve(url, output_file)
You should add handling exception etc, but this seems to be working, if I understood correctly.
Are you updating savepath? If you pass the same savepath to each loop iteration, it is likely just overwriting the same file over and over.
Hope that helps, happy coding!

Get file size, creation date and modification date in Python

I need to get file info (path, size, dates, etc) and save it in a txt but I don't know where or how to do it.
This is what I have:
ruta = "FolderPath"
os.listdir(path=ruta)
miArchivo = open("TxtPath","w")
def getListOfFiles(ruta):
listOfFile = os.listdir(ruta)
allFiles = list()
for entry in listOfFile:
fullPath = os.path.join(ruta, entry)
if os.path.isdir(fullPath):
allFiles = allFiles + getListOfFiles(fullPath)
else:
allFiles.append(fullPath)
return allFiles
listOfFiles = getListOfFiles(ruta)
for elem in listOfFiles:
print(elem)
print("\n")
miArchivo.write("%s\n" % (elem))
miArchivo.close()
The output is (only path, no other info):
What I want is:
V:\1111111\222222222\333333333\444444444\5555555555\66666666\Folder\File name -- size -- modification date and so on
I think that you may want to use scandir instead of listdir for this:
for item in os.scandir(my_path):
print(item.name, item.path, item.stat().st_size, item.stat().st_atime)
You will also want to check here for more detailed information regarding the appropriate calls (for the time you are looking for and the size). (os.scandir was added in Python 3.5.)
https://docs.python.org/2.7/library/os.path.html#module-os.path
os.path.getsize(path)   # size in bytes
os.path.getmtime(path)  # time of last content modification (seconds since the epoch)
os.path.getctime(path)  # OS specific: last metadata change on Unix, creation time on Windows
Here's a rewrite of your program. I did this:
Reformatted with autopep8 for better readability. (That's something you can install to prettify your code. IDEs such as PyCharm Community Edition can do the same, in addition to helping you with code completion and a GUI debugger.)
Made your getListofFiles() return a list of tuples. There are three elements in each one; the filename, the size, and the timestamp of the file, which appears to be what's known as an epoch time (time in seconds since 1970; you will have to go through python documentation on dates and times).
The tuples is written to your text file in a .csv style format (but note there are modules to do the same in a much better way).
Rewritten code:
import os
def getListOfFiles(ruta):
    """Recursively collect ``(path, size, ctime)`` for every file under ``ruta``.

    Sub-directories are descended into at the point where they are listed.
    Each non-directory entry is announced on stdout before it is measured.

    Args:
        ruta: Root directory to walk.

    Returns:
        A list of ``(full_path, size_in_bytes, ctime)`` tuples.
    """
    collected = []
    for entry in os.listdir(ruta):
        candidate = os.path.join(ruta, entry)
        if not os.path.isdir(candidate):
            print('getting size of fullPath: ' + candidate)
            byte_count = os.path.getsize(candidate)
            changed_at = os.path.getctime(candidate)
            collected.append((candidate, byte_count, changed_at))
            continue
        # Directory: splice in everything found beneath it.
        collected.extend(getListOfFiles(candidate))
    return collected
ruta = "FolderPath"
# The with-block closes (and flushes) the output file even if listing or
# writing raises part-way through.
with open("TxtPath", "w") as miArchivo:
    listOfFiles = getListOfFiles(ruta)
    for elem in listOfFiles:
        # One CSV-style line per file: path,size,timestamp
        miArchivo.write("%s,%s,%s\n" % (elem[0], elem[1], elem[2]))
Now it does this.
my-MBP:verynew macbookuser$ python verynew.py; cat TxtPath
getting size of fullPath: FolderPath/dir2/file2
getting size of fullPath: FolderPath/dir2/file1
getting size of fullPath: FolderPath/dir1/file1
FolderPath/dir2/file2,3,1583242888.4
FolderPath/dir2/file1,1,1583242490.17
FolderPath/dir1/file1,1,1583242490.17
my-MBP:verynew macbookuser$
To interpret the dates, use https://stackoverflow.com/a/52858040/11262633. Building on YamiOmar88's great answer:
import os
import datetime
def ts_to_dt(ts):
return datetime.datetime.fromtimestamp(ts)
for item in os.scandir(my_path):
print(item.name, item.path, item.stat().st_size, ts_to_dt(item.stat().st_atime))

Paraview - Using python script to export data in x3d format

I am trying to export in x3d format OpenFOAM results using paraview-python script. When I do it via paraview graphical interface it works and results can be visualized in Blender, see the following picture
However, when I try to do the same operation using the following script
from paraview.simple import *
import fnmatch
import os
import shutil
#create alist of all vtk files
vtkFiles = []
for root, dirnames, filenames in os.walk('.'):
for filename in fnmatch.filter(filenames, '*.vtk'):
vtkFiles.append(os.path.join(root, filename))
vtkFilesGroups=[
'U',
]
def ResetSession():
pxm = servermanager.ProxyManager()
pxm.UnRegisterProxies()
del pxm
Disconnect()
Connect()
def x3dExport(output,r):
#export in x3d format
exporters = servermanager.createModule("exporters")
Show(r)
view = GetActiveView()
render = Render()
x3dExporter = exporters.X3DExporter(FileName=output)
x3dExporter.SetView(view)
x3dExporter.Write()
ResetSession()
# group VTK files by gruop (fields in openfoam "vtkFilesGroups")
# then loop over all and save it into different formats
groupedVtkFiles=[]
for group in vtkFilesGroups:
vtkDir = os.path.join('.', group, 'vtk')
if not os.path.exists(vtkDir):
os.makedirs(vtkDir)
vtuDir = os.path.join('.', group, 'vtu')
if not os.path.exists(vtuDir):
os.makedirs(vtuDir)
x3dDir = os.path.join('.', group, 'x3d')
if not os.path.exists(x3dDir):
os.makedirs(x3dDir)
for stepFile in vtkFiles:
tmp = stepFile.split(os.sep)
oldFileName = tmp[-1].split('.')[0]
time = tmp[-2]
fileNameVtk = '{}_{}.vtk'.format(oldFileName, time)
fileNameVtp = '{}_{}.vtp'.format(oldFileName, time)
fileNameX3d = '{}_{}.x3d'.format(oldFileName, time)
r = LegacyVTKReader(FileNames=[stepFile])
w = XMLUnstructuredGridWriter()
w.FileName = os.path.join(vtuDir, fileNameVtp)
w.UpdatePipeline()
x3dExport(os.path.join(x3dDir, fileNameX3d), r)
the field values (velocity U) are not exported as you can see from this picture!
Can someone tell me what I am doing wrong?
Thank you!
Your problem is that the .foam file is not a scientific visualization file like VTK; the .foam file is only used by ParaView (by its extension, not by its content) to select the OpenFOAMReader and then use it for post-processing.
I have two solutions for you:
Read the reader documentation to find a way to do this.
Convert the results into VTK files with FoamToVTK and then loop over the results.
EDIT
I used this code to do that a long time ago:
from paraview.simple import *
import fnmatch
import os
import shutil
#create alist of all vtk files
vtkFiles = []
for root, dirnames, filenames in os.walk('.'):
for filename in fnmatch.filter(filenames, '*.vtk'):
vtkFiles.append(os.path.join(root, filename))
vtkFilesGroups=('p', 'U')
def ResetSession():
pxm = servermanager.ProxyManager()
pxm.UnRegisterProxies()
del pxm
Disconnect()
Connect()
def x3dExport(output,r):
#export in x3d format
exporters = servermanager.createModule("exporters")
Show(r)
view = GetActiveView()
render = Render()
x3dExporter = exporters.X3DExporter(FileName=output)
x3dExporter.SetView(view)
x3dExporter.Write()
ResetSession()
# group VTK files by gruop (fields in openfoam "vtkFilesGroups")
# then loop over all and save it into different formats
for group in vtkFilesGroups:
    # One output folder per field group: ./<group>/x3d
    x3dDir = os.path.join('.', group, 'x3d')
    if not os.path.exists(x3dDir):
        os.makedirs(x3dDir)
    # NOTE(review): `group in f` is a substring test on the whole path, so a
    # short name such as 'p' matches any path containing that letter.
    # Confirm against the actual directory layout.
    for stepFile in (f for f in vtkFiles if group in f):
        tmp = stepFile.split(os.sep)
        oldFileName = tmp[-1].split('.')[0]
        # The parent directory name is used as the time-step label.
        time = tmp[-2]
        fileNameX3d = '{}_{}.x3d'.format(oldFileName, time)
        # A reader must be created for every file; the original passed an
        # undefined `r` to x3dExport.
        r = LegacyVTKReader(FileNames=[stepFile])
        x3dExport(os.path.join(x3dDir, fileNameX3d), r)
You need to color your data in your script, with something like :
# value is (association, array name, component), e.g. ('POINTS', 'U', 'Magnitude')
ColorBy(yourRep, ('POINTS', 'YourScalar', 'YourComp'))
Documentation

How to extract the file name from a file path?

I have the following code:
os.listdir("staging")
# Seperate filename from extension
sep = os.sep
# Change the casing
for n in os.listdir("staging"):
print(n)
if os.path.isfile("staging" + sep + n):
filename_one, extension = os.path.splitext(n)
os.rename("staging" + sep + n, "staging" + sep + filename_one.lower() + extension)
# Show the new file names
print ('\n--------------------------------\n')
for n in os.listdir("staging"):
print (n)
# Remove the blanks, -, %, and /
for n in os.listdir("staging"):
print (n)
if os.path.isfile("staging" + sep + n):
filename_zero, extension = os.path.splitext(n)
os.rename("staging" + sep + n , "staging" + sep + filename_zero.replace(' ','_').replace('-','_').replace('%','pct').replace('/','_') + extension)
# Show the new file names
print ('\n--------------------------------\n')
for n in os.listdir("staging"):
print (n)
"""
In order to fix all of the column headers and to solve the encoding issues and remove nulls,
first read in all of the CSV's to python as dataframes, then make changes and rewrite the old files
"""
import os
import glob
import pandas as pd
files = glob.glob(os.path.join("staging" + "/*.csv"))
print(files)
# Create an empty dictionary to hold the dataframes from csvs
dict_ = {}
# Write the files into the dictionary
for file in files:
dict_[file] = pd.read_csv(file, header = 0, dtype = str, encoding = 'cp1252').fillna('')
In the dictionary, the dataframes are named as "folder/name(csv)" what I would like to do is remove the prefix "staging/" from the keys in the dictionary.
How can I do this?
If all you want to do is truncate the file paths to just the filename, you can use os.path.basename:
for file in files:
fname = os.path.basename(file)
dict_[fname] = (pd.read_csv(file, header=0, dtype=str, encoding='cp1252')
.fillna(''))
Example:
os.path.basename('Desktop/test.txt')
# 'test.txt'
import os
pathname ='c:\\hello\\dickins\\myfile.py'
# os.path.split uses the separator of the running platform: on Windows this
# gives the directory and 'myfile.py'; on POSIX a backslash is not a
# separator, so the whole string lands in `tail`.
head, tail = os.path.split(pathname)
# print() with a single argument works on both Python 2 and Python 3.
print(head)
print(tail)
This article here worked out just fine for me
import os
inputFilepath = 'path/to/file/foobar.txt'
# basename drops the directory part; splitext then separates the extension.
filename_w_ext = os.path.basename(inputFilepath)
filename, file_extension = os.path.splitext(filename_w_ext)
#filename = foobar
#file_extension = .txt
# The path must be a quoted string; unquoted it is a syntax error.
path, filename = os.path.split('path/to/file/foobar.txt')
# path = path/to/file
# filename = foobar.txt
Hope it helps someone searching for this answer
In the same spirit as truncating the file paths, use pathlib from the Python standard library. It turns the path into an easy-to-use object.
from pathlib import Path
path = Path('Desktop/folder/test.txt')
path.name  # test.txt
path.stem  # test
path.suffix  # .txt
path.parent.name  # folder
# .name returns a plain str, so go up with .parent twice before asking for
# the name (path.parent.name.name raises AttributeError).
path.parent.parent.name  # Desktop
As ColdSpeed said, you can use "os.path.basename" to truncate a file to its name, but I think what you are refering to is the ability to pycache the data?
For Example here is my Directory:
You see the pycache folder? that initializes it as a module.
Then, you can import a file from that module (for example the staging.txt file and operate on it.)
I use the IpConfig.txt File from the assets folder level (or should be) and take a line of information out of it.
import pygame as pyg
import sys
import os
import math
import ssl
import socket as sock
import ipaddress as ipad
import threading
import random
print("Modules Installed!")
class two:
# Find out how to refer to class super construct
def main(Display, SecSock, ipadd, clock):
# I have code here that has nothing to do with the question...
def __init__():
print("Initializing[2]...")
# Initialization of Pygame and SSL Socket goes here
searchQuery = open("IpConfig.txt", 'r') #Opening the File IpConfig(Which now should open on the top level of the game files)
step2 = searchQuery.readlines()# read the file
ipadd = step2[6] # This is what you should have or something similar where you reference the line you want to copy or manipulate.
main(gameDisplay, SSLSock, ipadd, clock)# Im having issues here myself - (main() is not defined it says)
print(ipadd)
print("Server Certificate Configuration Enabled...")
__init__() # Start up the procedure

Categories

Resources