I am trying to learn how to update a .txt filename when os.walk switches from files in one directory to files in another directory. I am not sure about how to do this. I tried iterating through dirs and then files, but this was unsuccessful as the .pdf files would not display. Here is a full look at the code I am working on.
The directory looks like this [research] -> [2014] -> Article1.pdf, article2.pdf article3.pdf
[2015] -> Article4.pdf, article5.pdf article6.pdf
[2016] -> Article7.pdf, article8.pdf article9.pdf
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import os
pytesseract.pytesseract.tesseract_cmd = r'/usr/local/Cellar/tesseract/4.1.1/bin/tesseract'
def image_ocr(image_path, output_txt_file_name, All_text):
image_text = pytesseract.image_to_string(
image_path, lang='eng+ces', config='--psm 1')
with open(output_txt_file_name, 'a', encoding='utf-8') as f:
f.write(image_text)
with open(All_text, 'a', encoding='utf-8') as f:
f.write(image_text)
num = 0
year = 1973
year_being_recorded = 'txt_files/' + str(year) + '_article.txt'
cumulative_text = 'txt_files/cumulative.txt'
for root, dirs, files in os.walk('articles'):
for file_ in files:
if file_.endswith('.pdf'):
article_path = str(root) + '/' + str(file_)
pages = convert_from_path(article_path, 500)
for page in pages:
name = 'jpegs/a_file_' + str(num) + '.jpeg'
page.save(name, 'JPEG')
image_ocr(name, year_being_recorded, cumulative_text)
num = num + 1
Related
import os
import shutil
src_folder = r"C:\new1\\"
dst_folder = r"C:\new2\\"
file_name = 'testword.docx'
if os.path.exists(dst_folder + file_name):
data = os.path.splitext(file_name)
only_name = data[0]
extension = data[1]
new_base = only_name + 'Renamed' + extension
new_name = os.path.join(dst_folder, new_base)
shutil.move(src_folder + file_name, new_name)
else:
shutil.move(src_folder + file_name, dst_folder + file_name)
I was trying to write a code to move a file from one folder to another and rename it. The file is moving to another folder, But I can't rename it. I am doing this using python(spyder). Can anyone help me with this.
Using another stackoverflow question & answer, I was able to locate code which partially resolves what I am trying to do Merge PDF files.
However, this modified code results in the contents of two PDFs overlapping each outer. I am trying to stack them or vertically concatenate the results:
Example:
PDF1 Contents -> "Hello World"
PDF2 Contents -> "I am Bill"
Code below results in the following overlapping image:
Desired results would look as follows:
Code Used resulting in overlapping imge:
import pdfrw
dirPATH = r'c:\users\<username>\projects\concat_pdfs'
pdf1 = os.path.join(dirPATH, 'PDF1.pdf')
pdf2 = os.path.join(dirPATH, 'PDF2.pdf')
def concat_pdfs(pdf1, pdf2, output):
form = pdfrw.PdfReader(pdf1)
olay = pdfrw.PdfReader(pdf2)
for form_page, overlay_page in zip(form.pages, olay.pages):
merge_obj = pdfrw.PageMerge()
overlay = merge_obj.add(overlay_page)[0]
pdfrw.PageMerge(form_page).add(overlay).render()
writer = pdfrw.PdfWriter()
writer.write(output, form)
concat_pdfs(section1, section2, 'result.pdf')
Thanks in advance!
Have you tried
def combine_pdfs(dir_path1, dir_path2, save_path):
pdf1 = pdfrw.PdfReader(dir_path1)
pdf2 = pdfrw.PdfReader(dir_path2)
pdf_writer = pdfrw.PdfWriter()
for page in pdf1.pages:
pdf_writer.addpage(page)
for page in pdf2.pages:
pdf_writer.addpage(page)
pdf_writer.write(save_path)
Here's an example using PyPDF2 library:
merger = PdfFileMerger()
for filename in files:
f = files[filename]
loc = "/tmp/" + secure_filename(filename).replace(".pdf", "") + "_" + str(time.time()) + ".pdf"
f.save(loc)
f.close()
reader = PdfFileReader(loc, "rb")
merger.append(reader)
dest = "/tmp/merged_" + str(time.time()) + ".pdf"
merger.write(dest)
Here is another using pike pdf:
pdf = Pdf.new()
for filename in files:
f = files[filename]
loc = "/tmp/" + secure_filename(filename).replace(".pdf", "") + "_" + str(time.time()) + ".pdf"
f.save(loc)
f.close()
reader = Pdf.open(loc)
pdf.pages.extend(reader.pages)
dest = "/tmp/merged_" + str(time.time()) + ".pdf"
pdf.save(dest)
Imports might look something like:
import time
import pdfkit
import os
from PyPDF2 import PdfFileMerger, PdfFileReader
from werkzeug.utils import secure_filename
from pikepdf import Pdf
This piece of code is my first attempt at creating a program. I'm getting an error when running it that reads:
PermissionError: [WinError 32] The process cannot access the file
because it is being used by another process:
'C:\Users\gabri\Desktop\' -> 'C:\Users\gabri\Desktop\Planilhas
Excel\'
What am I doing wrong? The goal of this program is to get all excel, then pdf, then word files and put them in folders created by the program.
import os
from glob import glob
# import cx_Freeze
print("Digite o diretório de origem.")
dirOrigem = input()
os.chdir(dirOrigem)
excel_files = glob('*.xlsx')
excel_files.append(''.join(glob('*.xls')))
dirDestinoXL = dirOrigem + '\\' + 'Planilhas Excel'
if not os.path.exists(dirDestinoXL):
os.makedirs(dirDestinoXL)
for i in excel_files:
os.rename(f'{dirOrigem}\\{"".join(i)}', f'{dirDestinoXL}\\{"".join(i)}')
os.chdir(dirOrigem)
pdf_files = glob('*.pdf')
dirDestinoPDF = dirOrigem + '\\' + 'PDF'
if not os.path.exists(dirDestinoPDF):
os.makedirs(dirDestinoPDF)
for p in pdf_files:
os.rename(f'{dirOrigem}\\{"".join(p)}', f'{dirDestinoPDF}\\{"".join(p)}')
os.chdir(dirOrigem)
word_files = glob('*.doc')
word_files.append(glob('*.docx'))
dirDestinoWord = dirOrigem + '\\' + 'Word'
if not os.path.exists(dirDestinoWord):
os.makedirs(dirDestinoWord)
for d in word_files:
os.rename(f'{dirOrigem}\\{"".join(d)}', f'{dirDestinoWord}\\{"".join(d)}')
I tried your program and it doesn't work as it is on my computer. I changed some lines and it works. Hope it helps
import os
from glob import glob
dirOrigem = r'C:\Users\fchal\Desktop\temp' # here I changed the code just because I didn't want to bother using input()
os.chdir(dirOrigem)
excel_files = glob('*.xlsx')
excel_files.extend(glob('*.xls'))
dirDestinoXL = dirOrigem + '\\' + 'xlsfile'
if not os.path.exists(dirDestinoXL):
os.makedirs(dirDestinoXL)
for i in excel_files:
os.rename(i, os.path.join(dirDestinoXL, i))
# same procedure for pdf and word files
I know that glob can be a mess sometimes. And if the files are open, you can get errors. Here's what I would do:
import os
def move_files_with_extension(from_dir, to_dir, *extensions):
if not os.path.isdir(from_dir):
raise ValueError('{} is not a real directory'.format(from_dir))
elif not os.path.isdir(to_dir):
raise ValueError('{} is not a real directory'.format(to_dir))
files_with_extensions = all_files_with_extensions_in(from_dir, *extensions)
for file_path in files_with_extensions:
os.rename(file_path, os.path.join(to_dir, os.path.basename(file_path)))
def all_files_with_extensions_in(dir, *extensions):
files_with_extensions = list()
for dir_path, dir_names, file_names in os.walk(dir):
for file_name in file_names:
if file_name.endswith(extensions):
files_with_extensions.append(os.path.join(dir_path, file_name))
return files_with_extensions
and then you can do:
dirOrigem = input()
excel_location = os.path.join(dirOrigem, 'Planilhas Excel')
move_files_with_extension(dirOrigem, excel_location, '.xls', '.xlsx')
and so on
This is one of my first python projects, and i'm trying to make a script that would write a script which can re-create the src/ directory. this would be what i distribute to users. It uses walk, and writes a python file that first creates all the directories, and then writes the files. The issue i have is making the the files into a single string that i can write to a file.
This is the program i have:
import os
import pickle
src = os.path.dirname(os.path.realpath(__file__)) + os.sep + 'src'
fPack = 'import os \nimport pickle \nmyDir = os.path.dirname(os.path.realpath(__file__))'
Pack =''
print 'Packing ' + src
pickle
for root, dirs, files in os.walk(src, topdown=True):
for name in files:
print os.path.join(root, name)
f = open(os.path.join(root, name), 'r')
Pack = Pack + '\nf = open(os.path.join(myDir,\'' + name + '\'), \'w\')'
fileCont = pickle.dumps(f.read())
Pack = Pack + '\nf.write(pickle.loads(\'' + fileCont + '\'))'
for name in dirs:
print os.path.join(root, name)
fPack = fPack + '\nos.makedirs(os.path.join(myDir,\'' + name + '\'))'
print '==================================================\n\n\n'
print fPack + Pack
f = open(os.getcwd() + os.sep + 'dist' + os.sep + 'Pack.py', 'w')
f.write(fPack)
f.write(Pack)
And if i run it in a directory with on subdirectory, and on file inside it creates this file
import os
import pickle
myDir = os.path.dirname(os.path.realpath(__file__))
os.makedirs(os.path.join(myDir,'SphereText'))
f = open(os.path.join(myDir,'TextMain.py'), 'w')
f.write(pickle.loads('S"########################################################\n#Main SphereText file. #\n#SpereText is a simple Notepad-Like plain text editor #\n########################################################\n\nfrom Tkinter import *\nfrom tkFileDialog import *\nimport tkSimpleDialog\n\nroot = Tk()\nroot.title('SphereText')\n\ndef fSave():\n fileName = asksaveasfilename(parent=root)\n f = open(fileName, 'w')\n f.write(text.get(1.0,END))\n\ndef fOpen():\n fileName = ''\n fileName = askopenfilename(parent=root)\n f = open(fileName, 'r')\n text.delete(1.0,END)\n text.insert(1.0, f.read())\n\ndef tReplace():\n Old = tkSimpleDialog.askstring('SphereText', 'Replace:')\n print Old\n New = tkSimpleDialog.askstring('SphereText', 'With:')\n print New\n content = text.get(1.0,END)\n content = content.replace(Old, New)\n text.delete(1.0,END)\n text.insert(1.0, content)\n \nmenubar = Menu(root)\nmenubar.add_command(label='Save', command=fSave)\nmenubar.add_command(label='Open', command=fOpen)\nmenubar.add_command(label='Replace', command=tReplace)\nroot.config(menu=menubar)\n\ntext = Text(root, wrap=WORD)\n\ntext.pack()\n\nroot.mainloop()\n"
p0
.'))
The 's aren't escaped, and there are two line breaks at the end. i thought that the whole point of serializing was that you could always read it back the same way. Anyone know how i can mak the file a valid string?
Sorry about the newbish question, i just found out i had been trying to reinvent the wheel. apparently, that already exists under the name Squeeze.
I have the same problem as here but now I'm trying to do the same with python because it's more suited to the task.
I've started with this:
import os
import shutil
import random
import glob
root_dir = '/home/leonardo/Desktop/python_script/rfe'
output_dir = '/home/leonardo/Desktop/python_script/output_folder'
ref = 200
folders_root_dir = os.listdir(root_dir)
print folders_root_dir
count = len(folders_root_dir)
print count
for i in xrange(count):
folder_inside = root_dir + '/' + folders_root_dir[i]
print folder_inside
number_files_folder_inside = len(os.listdir(folder_inside))
print number_files_folder_inside
if number_files_folder_inside > ref:
ref_copy = round(0.2*number_files_folder_inside)
print ref_copy
# here I have to copy 20% of the files in this folder to the output folder
else:
# here I have to copy all files from the folder to the output_dir
I tried to use os.walk() but I'm new to python and selecting files while the function is working proved to be really tough.
You'll need to import these:
import os
import shutil
import random
You can get all the files in a directory like this:
files = [file for file in os.listdir(dir) if os.path.isfile(os.path.join(dir, file))]
Then use a conditional:
if len(files) < 200:
for file in files:
shutil.copyfile(os.path.join(dir, file), dst)
else:
# Amount of random files you'd like to select
random_amount = 1000
for x in xrange(random_amount):
if len(files) == 0:
break
else:
file = random.choice(files)
shutil.copyfile(os.path.join(dir, file), outputdir)
A more compact solution (also noticing that copyfile does not really do the job properly unless one specifies the target file name as well):
import os
import shutil
import random
def get_file_list(input_dir):
return [file for file in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, file))]
def get_random_files(file_list, N):
return random.sample(file_list, N)
def copy_files(random_files, input_dir, output_dir):
for file in random_files:
shutil.copy(os.path.join(input_dir, file), output_dir)
def main(input_dir, output_dir, N):
file_list = get_file_list(input_dir)
random_files = get_random_files(file_list, N)
copy_files(random_files, input_dir, output_dir)
import os
import shutil
import random
root_dir = '/home/leonardo/Desktop/python_script/qar'
output_dir = '/home/leonardo/Desktop/python_script/output_folder'
ref = 1
for root, dirs, files in os.walk(root_dir):
number_of_files = len(os.listdir(root))
if number_of_files > ref:
ref_copy = int(round(0.2 * number_of_files))
for i in xrange(ref_copy):
chosen_one = random.choice(os.listdir(root))
file_in_track = root
file_to_copy = file_in_track + '/' + chosen_one
if os.path.isfile(file_to_copy) == True:
shutil.copy(file_to_copy,output_dir)
print file_to_copy
else:
for i in xrange(len(files)):
track_list = root
file_in_track = files[i]
file_to_copy = track_list + '/' + file_in_track
if os.path.isfile(file_to_copy) == True:
shutil.copy(file_to_copy,output_dir)
print file_to_copy
print 'Finished !'
The final code has this face
thank you guys for the help !
cheers !
I want this for splitting my dataset to train,test and validation.
here is my code :
import os
import shutil
import random
import numpy as np
dir = r'E:\down\imgs'
train_dir = r'E:/train_test_split/train'
test_dir = r'E:/train_test_split/test'
valid_dir = r'E:/train_test_split/validation'
files = [file for file in os.listdir(dir) if os.path.isfile(os.path.join(dir, file))]
train_count = np.round(50/100*len(files))
test_count = np.round(30/100*len(files))
valid_count = np.round(20/100*len(files))
rndnums = list(random.sample(range(0, len(files)), len(files)))
print("len(files)",len(files))
# print("all",len(files))
# print("train",np.round(train*len(files)))
# print("test",np.round(test*len(files)))
# print("valid",np.round(valid*len(files)))
#
# print("sum",np.round(train*len(files)) + np.round(test*len(files)) + np.round(valid*len(files)))
# Amount of random files you'd like to select
##train_files
print(rndnums)
train_file_index = rndnums[0:int(train_count)+1]
train_file_name = [files[i] for i in train_file_index]
test_file_index = rndnums[int(train_count)+1:int(train_count + test_count)+1]
test_file_name = [files[i] for i in test_file_index]
valid_file_index = rndnums[int(train_count + test_count)+1:]
valid_file_name = [files[i] for i in valid_file_index]
for x in train_file_name:
file = x
shutil.copyfile(os.path.join(dir, file), os.path.join(train_dir, file))
##test_files
for y in test_file_name:
file = y
shutil.copyfile(os.path.join(dir, file), os.path.join(test_dir, file))
##valid_files
for z in valid_file_name:
file = z
shutil.copyfile(os.path.join(dir, file), os.path.join(valid_dir, file))
maybe something like (untested)
import os
THRESHOLD = 200
root_dir = "\home..."
output_dir = "\home....."
for top, dirs, nondirs in os.walk(root_dir):
for name in nondirs[:THRESHOLD]:
path = os.path.join(top, name)
destination = os.path.join(output_dir, name)
os.rename(path, destination)
import random
import shutil
import os
rootdir = '/home/leonardo/Desktop/python_script/qar'
outdir = '/home/leonardo/Desktop/python_script/output_folder'
ref = 200
dirsAndFiles = {} # here we store a structure {folder: [file1, file2], folder2: [file2, file4] }
dirs = [x[0] for x in os.walk(rootdir)] # here we store all sub-dirs
for dir in dirs:
dirsAndFiles[dir] = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
for (dir, files) in dirsAndFiles.iteritems():
if len(files) > ref:
for i in xrange(int(0.2*len(files))): # copy 20% of files
fe = random.choice(files)
files.remove(fe)
shutil.copy(os.path.join(dir, fe), outdir)
else: # copy all files
for file in files:
shutil.copy(os.path.join(dir, file), outdir)