Copying files from SMB server to local drive(Linux/Windows) in python

Copying files from SMB server to local drive(Linux/Windows) in python - python

I am using smb module to connect to smb server. I am unable to figure out what exactly I could do to copy files from smb to my local drive as I am using linux machine.
import tempfile
from smb.SMBConnection import SMBConnection
from nmb.NetBIOS import NetBIOS
conn = SMBConnection('salead',
'repo#2k12',
'192.168.14.1',
'SERVER',
use_ntlm_v2=True)
assert conn.connect('192.168.1.41', 139)
if conn:
print "successfull",conn
else:
print "failed to connect"
If anyone can help me out it would be a great help for me. Thanks

According to some documentation, SMBConnection.retrieveFile() is the function you are searching for.
Example:
# UNTESTED
conn = SMBConnection('salead',
'repo#2k12',
'192.168.14.1',
'SERVER',
use_ntlm_v2 = True)
assert conn.connect('192.168.1.41', 139)
with open('local_file', 'wb') as fp:
conn.retrieveFile('share', '/path/to/remote_file', fp)
Documentation: http://pysmb.readthedocs.io/en/latest/api/smb_SMBConnection.html
Example (in Japanese): http://symfoware.blog68.fc2.com/blog-entry-999.html

The documentation referenced above by #Rob should get you there. Here is an idea that works using an abstracted class:
> pip install pysmb
import subprocess
from smb import SMBConnection
class SMBClient:
def __init__(self, ip, username, password, servername, share_name):
self._ip = ip
self._username = username
self._password = password
self._port = 445
self._share_name = share_name
self._servername = servername
self._server = ''
def _connect(self):
""" Connect and authenticate to the SMB share. """
self._server = SMBConnection(username=self._username,
password=self._password,
my_name=self._get_localhost(),
remote_name=self._servername,
use_ntlm_v2=True)
self._server.connect(self._ip, port=self._port)
def _download(self, files: list):
""" Download files from the remote share. """
for file in files:
with open(file, 'wb') as file_obj:
self._server.retrieveFile(service_name=self._share_name,
path=file,
file_obj=file_obj)
def _get_localhost(self):
self._host = subprocess.Popen(['hostname'],stdout=subprocess.PIPE).communicate()[0].strip()
Then, all you need to do is this:
filename = [the file you want to download]
smb_client = SMBClient(ip='192.168.14.1', username='salead', password='repo#2k12', servername='SERVER', share_name='SHARE_NAME')
smb_client._connect()
response = smb_client._download(filename)
Main Documentation: https://pysmb.readthedocs.io/en/latest/index.html
SMBConnection Documentation: https://pysmb.readthedocs.io/en/latest/api/smb_SMBConnection.html

In my case, it was necessary to copy files from one remote machine to another remote machine. Both remote machines are managed by Windows OS
import tempfile
from smb.SMBConnection import SMBConnection
client_machine_name = 'mylocalmachinename'
# first domen settings
userID_1 = 'admin_1'
password_1 = '123'
server_name_1 = 'SERVER_1'
server_ip_1 = '192.168.1.1'
# path to remote file in windows format we want to read
# "D:\myfiles\dir\somefile.txt". In this case, full access
# to disk d is open. But if you have published only a separate
# catalog, in our example "myfiles", then:
# share_resource_1 = 'myfiles'
# path_to_file_1 = '/dir/somefile.txt'
share_resource_1 = 'd'
path_to_file_1 = '/myfiles/dir/somefile.txt'
# second domen settings
userID_2 = 'admin_2'
password_2 = '123'
server_name_2 = 'SERVER_2'
server_ip_2 = '192.168.1.2'
# path to remote file we want to save in windows format "D:\myfiles\dir\somefile_copy.txt"
share_resource_2 = 'd'
path_to_file_2 = '/myfiles/dir/somefile_copy.txt'
conn_1 = SMBConnection(
userID_1, password_1, client_machine_name, server_name_1, use_ntlm_v2 = True
)
conn_2 = SMBConnection(
userID_2, password_2, client_machine_name, server_name_2, use_ntlm_v2 = True
)
conn_1.connect(server_ip_1, 139)
conn_2.connect(server_ip_2, 139)
with tempfile.NamedTemporaryFile() as file_obj:
conn_1.retrieveFile(share_resource_1, path_to_file_1, file_obj)
# Note that the file obj is positioned at the end-of-file,
# so you might need to perform a file_obj.seek() if you need
# to read from the beginning
file_obj.seek(0)
conn_2.storeFile(share_resource_2, path_to_file_2, file_obj)

Related

FileNotFoundError even though Absolute Path Specified

I am running a simple python webserver [SimpleHTTPServer] in my Linux . Wrote a python program to download all the files hosted in that server to my Windows Machine . But for some reason program is throwing FileNotFoundError even though Directory exists and I've provided Absolute Path .
Here is the code : https://drive.google.com/file/d/1CDrueDJcbu2z1XeeB_iYv0zmfIX1cCkx/view?usp=sharing
It's working correctly in Linux but trouble with Windows . Thanks
import requests
import argparse
from sys import argv
from urllib.parse import unquote
import os
from time import time
import random
from colorama import Fore, Style
import platform
def formatFiles(name):
name = name[13:-9]
nameLen = len(name) - 2
nameLen = int(nameLen/2)
name = name[:nameLen]
return name
# Creating a Temporary Folder to Download all the files in it
def fileCreate(saveFolder):
random.seed(int(time()))
text = ""
for x in range(5):
y = random.randrange(65,91)
text += chr(y)
saveFolder += text
os.popen("mkdir {}".format(saveFolder))
print("Temp Directory {} created to save files/folders".format(text))
return saveFolder
def winDows(endPoint, banner):
resp = requests.get(endPoint, allow_redirects=True)
resp = resp.text.split("\n")
resp = list(map(unquote, resp[10:-5])) #URL decoding using unquote
resp = list(map(formatFiles,resp))
for dir in resp:
tempPath = ""
tempEndpoint = endPoint[len(serverURL):] # Getting directory structure by removing IP:PORT in URL
tempPath = "\\".join(tempEndpoint.split("/")) # Removing / and adding \\ for Windows path
print(banner + dir)
tdir = dir
if(dir[-1] == "/"):
if( dir.split(" ")!=1 ): # If the directory name has spaces ,
tdir = dir[:-1]
tdir = "\""+tdir+"\""+"\\"
os.popen("mkdir "+saveFolder+"\\"+tempPath+tdir)
r = winDows(endPoint+dir, banner[:-4]+" |___")
else:
data = open(saveFolder+"\\"+tempPath+dir, "wb")
fileData = requests.get(endPoint+dir, allow_redirects=True)
data.write(fileData.content)
data.close()
return 1
parser = argparse.ArgumentParser()
sideBanner = " |___ "
parser.add_argument("ip", help = "IP address of FTP Server", type=ip_valid)
parser.add_argument("port" , help = "FTP Server Port you want to access", type=port_valid)
parser.add_argument("dst", help="Destination Path to save your files")
args = parser.parse_args()
ip = argv[1]
port = argv[2]
saveFolder = argv[3]
serverURL = "http://"+ip+":"+port+"/"
saveFolder = fileCreate(saveFolder)
print("Destination Folder - {}".format(saveFolder))
if(platform.system() == "Linux"):
linuX(serverURL, sideBanner)
else:
winDows(serverURL, sideBanner)

Python for/while loop

Today i am working on a project about incoming phone calls being transcripted and getting saved into text files, but i am also kinda new to python and python loops.
I want to loop over a SQL server column and let each row loop trough the azure Speech to text service i use (all of the phonecall OID's). I have been stuck on this problem for a couple days now so i thought i might find some help here.
import azure.cognitiveservices.speech as speechsdk
import time
from os import path
from pydub import AudioSegment
import requests
import hashlib
import sys
import os.path
import pyodbc
databaseName = '*'
username = '*'
password = '*'
server = '*'
driver = '*'
try:
CONNECTION_STRING = 'DRIVER='+driver+';SERVER='+server+';DATABASE='+databaseName+';UID='+username+';PWD='+ password
conn = pyodbc.connect(CONNECTION_STRING)
cursor = conn.cursor()
storedproc = "* = *'"
cursor.execute(storedproc)
row = cursor.fetchone()
while row:
array = [(int(row[1]))]
row = cursor.fetchone()
i = 0
while i<len(array):
OID = (array[i])
i = i + 1
print(OID)
string = f"{OID}*"
encoded = string.encode()
result = hashlib.sha256(encoded)
resultHash = (result.hexdigest())
Telefoongesprek = requests.get(f"*{OID}", headers={f"api-key":f"{resultHash}"})
with open("Telefoongesprek.mp3", "wb") as f:
f.write(Telefoongesprek.content)
src = "Telefoongesprek.mp3"
dst = "Telefoongesprek.wav"
sound = AudioSegment.from_file(src)
sound.export(dst, format="wav")
def speech_recognize_continuous_from_file():
speech_config = speechsdk.SpeechConfig(subscription="*", region="*")
speech_config.speech_recognition_language = "nl-NL"
audio_config = speechsdk.audio.AudioConfig(filename="Telefoongesprek.wav")
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
done = False
def stop_cb(evt):
print('CLOSING on {}'.format(evt))
nonlocal done
done = True
all_results = []
def handle_final_result(evt):
all_results.append(evt.result.text)
speech_recognizer.recognized.connect(handle_final_result)
speech_recognizer.session_started.connect(handle_final_result)
speech_recognizer.session_stopped.connect(handle_final_result)
speech_recognizer.canceled.connect(handle_final_result)
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)
speech_recognizer.start_continuous_recognition()
while not done:
time.sleep(.5)
speech_recognizer.stop_continuous_recognition()
print(all_results)
telefoongesprek = str(all_results)
filename = f"C:\\Users\\Beau\\Contact-verkeer\\contact-verkeer\\telefoon\\STT Transcriptions\\Telefoongesprek#{OID}.txt"
file = open(filename, "w")
file.write(telefoongesprek)
file.close()
speech_recognize_continuous_from_file()
cursor.close()
del cursor
conn.close()
except Exception as e:
print("Error: %s" % e)
everything works apart form each other but i just dont know how to place the loop and witch one i should use (For/While loop). right here im trying to loop over an array but i dont this this is correct.
Error message: Decoding failed. ffmpeg returned error code: 1
[mp3 # 000001cb8c57e0o0] Failed to read frame size: could not seek to 1073.
which i am pretty sure means that my azure function can't find an mp3 file, what means that the "Mp3 to Wav" convert doesn't work.
Thanks in advance!

If I understand your question, you have a database with lots of phone call details. One of the field value in each row is used to create the associated mp3 file. You want to do speech to text using azure on each of the mp3 file you have in your database.
So you can do it in two ways:
Iterate though all rows in the database and create all the associted files into a folder in the local disk with the OID as your filename.
Then write another loop to iterate through this folder and send the files for transcription to Azure Speech to Text service.
The other technique is to do everything in a single loop like the way you have shown which will require some corrections.
Ok, so now that part is clear, we can go into the speech to text part. So azure allow you to send the compressed format for transcription, which means you actually don't need to convert it into wav file.
Please have a look at the modified code below with the changes:
# code snippet borrowed from azure samples
def speech_recognize_continuous_from_file(filename):
class BinaryFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
def __init__(self, filename: str):
super().__init__()
self._file_h = open(filename, "rb")
def read(self, buffer: memoryview) -> int:
try:
size = buffer.nbytes
frames = self._file_h.read(size)
buffer[:len(frames)] = frames
return len(frames)
except Exception as ex:
print('Exception in `read`: {}'.format(ex))
raise
def close(self) -> None:
print('closing file')
try:
self._file_h.close()
except Exception as ex:
print('Exception in `close`: {}'.format(ex))
raise
# Creates an audio stream format. For an example we are using MP3 compressed file here
compressed_format = speechsdk.audio.AudioStreamFormat(compressed_stream_format=speechsdk.AudioStreamContainerFormat.MP3)
callback = BinaryFileReaderCallback(filename=filename)
stream = speechsdk.audio.PullAudioInputStream(stream_format=compressed_format, pull_stream_callback=callback)
speech_config = speechsdk.SpeechConfig(subscription="*", region="*")
speech_config.speech_recognition_language = "nl-NL"
audio_config = speechsdk.audio.AudioConfig(stream=stream)
# Creates a speech recognizer using a file as audio input, also specify the speech language
speech_recognizer = speechsdk.SpeechRecognizer(speech_config, audio_config)
done = False
def stop_cb(evt):
print('CLOSING on {}'.format(evt))
nonlocal done
done = True
all_results = []
def handle_final_result(evt):
all_results.append(evt.result.text)
speech_recognizer.recognized.connect(handle_final_result)
speech_recognizer.session_started.connect(handle_final_result)
speech_recognizer.session_stopped.connect(handle_final_result)
speech_recognizer.canceled.connect(handle_final_result)
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)
speech_recognizer.start_continuous_recognition()
while not done:
time.sleep(.5)
speech_recognizer.stop_continuous_recognition()
print(all_results)
telefoongesprek = str(all_results)
filename = f"C:\\Users\\Beau\\Contact-verkeer\\contact-verkeer\\telefoon\\STT Transcriptions\\Telefoongesprek#{OID}.txt"
file = open(filename, "w")
file.write(telefoongesprek)
file.close()
try:
CONNECTION_STRING = 'DRIVER='+driver+';SERVER='+server+';DATABASE='+databaseName+';UID='+username+';PWD='+ password
conn = pyodbc.connect(CONNECTION_STRING)
cursor = conn.cursor()
storedproc = "* = *'"
cursor.execute(storedproc)
row = cursor.fetchone()
# loop through the rows
while row:
array = [(int(row[1]))]
i = 0
while i<len(array):
OID = (array[i])
i = i + 1
print(OID)
string = f"{OID}*"
encoded = string.encode()
result = hashlib.sha256(encoded)
resultHash = (result.hexdigest())
telefoongesprek_response = requests.get(f"*{OID}", headers={f"api-key":f"{resultHash}"})
# save the file to local disk as mp3
with open("Telefoongesprek.mp3", "wb") as f:
f.write(telefoongesprek_response.content)
# do the speech to text on the mp3 file
speech_recognize_continuous_from_file(f.name)
# fetch the next row
row = cursor.fetchone()
cursor.close()
del cursor
conn.close()
except Exception as e:
print("Error: %s" % e)
I haven't tested this full code as i don't have the db connections with me. Please fell free to modify for your use case and let me know if you have any issues.

Why does desktop.getCurrentComponent() return None in PyUNO?

Trying to revive a PyUNO sample script called Wavelet to learn how LO works nowadays and get re-started. Since LibreOffice & UNO changed a bit from the script's creation time I am running into problems.
Managed to get the desktop object. Now I want to retrieve the open document's component. How do I achieve this properly? The desktop.getCurrentComponent() call returns None.
LibreOffice version: 6.4.6.2.
System: Ubuntu MATE 20.04 x86_64.
The code follows:
#!/usr/bin/python3
def TestWave(event):
wavelet = Wavelet(XSCRIPTCONTEXT)
wavelet.trigger( () )
import uno
import unohelper
import string
from com.sun.star.task import XJobExecutor
class Wavelet( unohelper.Base, XJobExecutor ):
def __init__( self, ctx ):
self.ctx = ctx
def trigger( self, args ):
desktop = self.ctx.ServiceManager.createInstanceWithContext(
"com.sun.star.frame.Desktop", self.ctx )
doc = desktop.getCurrentComponent()
print('doc:', doc)
#try:
search = doc.createSearchDescriptor()
search.SearchRegularExpression = True
search.SearchString = "\\<(k|s|v|z|o|u|i|a) "
found = doc.findFirst( search )
while found:
print("found:", found.String)
found.String = string.replace( found.String, " ", u"\xa0" )
found = doc.findNext( found.End, search)
#except:
# pass
g_ImplementationHelper = unohelper.ImplementationHelper()
g_ImplementationHelper.addImplementation(
Wavelet,
"name.vojta.openoffice.Wavelet",
("com.sun.star.task.Job",),)
if __name__ == "__main__":
import os
# Start OpenOffice.org, listen for connections and open testing document
os.system( "loffice '--accept=socket,host=localhost,port=2002;urp;' --writer ./WaveletTest.odt &" )
# Get local context info
localContext = uno.getComponentContext()
resolver = localContext.ServiceManager.createInstanceWithContext(
"com.sun.star.bridge.UnoUrlResolver", localContext )
ctx = None
# Wait until the OO.o starts and connection is established
while ctx == None:
try:
ctx = resolver.resolve(
"uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext" )
except:
pass
# Trigger our job
wavelet = Wavelet( ctx )
wavelet.trigger( () )
Output:
doc: None
Traceback (most recent call last):
File "./wavelet.py", line 62, in <module>
wavelet.trigger( () )
File "./wavelet.py", line 24, in trigger
search = doc.createSearchDescriptor()
AttributeError: 'NoneType' object has no attribute 'createSearchDescriptor'
Edit 1
Cross posted at the following address:
https://ask.libreoffice.org/en/question/283785/why-does-desktopgetcurrentcomponent-return-none-in-pyuno/

Try without giving desktop.getCurrentComponent() a variable, so erase doc =. I remember I had that problem, but did not understand why it was doing it. All I remember is that not naming it made my code work. That is the only advice I can give you.

Tried to wait until the document component becomes available. And it worked:
doc = None
while doc is None:
doc = desktop.getCurrentComponent()

most efficient way to create file and delete file of same name in for loop python

I download data from aws s3 then unzip, format data then try to send that unzipped, formatted file to aws elasticsearch.
For each file it will create it on my local computer as "access.log", delete it after unzip, formatting, send to aws elasticsearch then repeat the process.
However when I try to do this with same name it tells me that I cannot delete while working with a file, I do not want to create multiple files. What would be the most efficient way to do this?
NOTE: acc_logs is a list of many log files. ex: acc_log_file.gz
here is my code:
for log_file in acc_logs: # one file
bucket.download_file(log_file, 'C:\\Users\\name\\Desktop\\s3-to-es\\access.log')
with gzip.open('C:\\Users\\name\\Desktop\\s3-to-es\\access.log') as log_file:
for line in log_file:
line = line.decode("utf-8") # decode byte to str
try:
ip = ip_pattern.search(line).group(0)
except:
ip = None
try:
host = host_pattern.search(line).group(0)[1:-1]
except:
host = None
date = time_pattern.search(line).group(0)[1:-1]
pos = [x.end() for x in re.finditer('"', line)]
request = line[pos[0]:pos[1]-1]
referr_url = line[pos[2]:pos[3]-1]
user_agent = line[pos[4]:pos[5]-1]
status, bytes = line[pos[1]+1:pos[2]-2].split()
ident, authuser = authuser_pattern.search(line).group(0).split()
document = {"server_ip": ip,
"host":host,
"ident":ident,
"authuser":authuser,
"date": date,
"request":request,
"status":status,
"bytes":bytes,
"referr_url":referr_url,
"user_agent":user_agent
}
ToElasticSearch(document) #upload each line to es
if count == 10:
break
os.remove('C:\\Users\\name\\Desktop\\s3-to-es\\access.log')
ToElasticSearch looks like:
def ToElasticSearch(document):
"""
upload given document to specified index in AWS
"""
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)
es = Elasticsearch(
hosts = [{'host': es_host, 'port': 443}],
http_auth = awsauth,
use_ssl = True,
verify_certs = True,
connection_class = RequestsHttpConnection
)
es.index(index="acc_logs", doc_type="_doc", body=document)

You are trying to delete the file while it is still open:
with gzip.open('C:\\Users\\name\\Desktop\\s3-to-es\\access.log') as log_file:
for line in log_file:
...blah...
os.remove('C:\\Users\\name\\Desktop\\s3-to-es\\access.log')
If you change the indent on the os.remove() line outwards one level (less spaces), the the file will not be open and it should delete just fine.

"Not implemented" Exception when using pywin32 to control Adobe Acrobat

I have written a script in python using pywin32 to save pdf files to text that up until recently was working fine. I use similar methods in Excel. The code is below:
def __pdf2Txt(self, pdf, fileformat="com.adobe.acrobat.accesstext"):
outputLoc = os.path.dirname(pdf)
outputLoc = os.path.join(outputLoc, os.path.splitext(os.path.basename(pdf))[0] + '.txt')
try:
win32com.client.gencache.EnsureModule('{E64169B3-3592-47d2-816E-602C5C13F328}', 0, 1, 1)
adobe = win32com.client.DispatchEx('AcroExch.App')
pdDoc = win32com.client.DispatchEx('AcroExch.PDDoc')
pdDoc.Open(pdf)
jObject = pdDoc.GetJSObject()
jObject.SaveAs(outputLoc, "com.adobe.acrobat.accesstext")
except:
traceback.print_exc()
return False
finally:
del jObject
pdDoc.Close()
del pdDoc
adobe.Exit()
del adobe
However this code has suddenly stopped working and I get the following output:
Traceback (most recent call last):
File "C:\Documents and Settings\ablishen\workspace\HooverKeyCreator\src\HooverKeyCreator.py", line 38, in __pdf2Txt
jObject.SaveAs(outputLoc, "com.adobe.acrobat.accesstext")
File "C:\Python27\lib\site-packages\win32com\client\dynamic.py", line 505, in __getattr__
ret = self._oleobj_.Invoke(retEntry.dispid,0,invoke_type,1)
com_error: (-2147467263, 'Not implemented', None, None)
False
I have similar code written in VB that works correctly so I'm guessing that it has something to do with the COM interfaces not binding to the appropriate functions correctly? (my COM knowledge is patchy).

Blish, this thread holds the key to the solution you are looking for: https://mail.python.org/pipermail/python-win32/2002-March/000260.html
I admit that the post above is not the easiest to find (probably because Google scores it low based on the age of the content?).
Specifically, applying this piece of advice will get things running for you: https://mail.python.org/pipermail/python-win32/2002-March/000265.html
For reference, the complete piece of code that does not require you to manually patch dynamic.py (snippet should run pretty much out of the box):
# gets all files under ROOT_INPUT_PATH with FILE_EXTENSION and tries to extract text from them into ROOT_OUTPUT_PATH with same filename as the input file but with INPUT_FILE_EXTENSION replaced by OUTPUT_FILE_EXTENSION
from win32com.client import Dispatch
from win32com.client.dynamic import ERRORS_BAD_CONTEXT
import winerror
# try importing scandir and if found, use it as it's a few magnitudes of an order faster than stock os.walk
try:
from scandir import walk
except ImportError:
from os import walk
import fnmatch
import sys
import os
ROOT_INPUT_PATH = None
ROOT_OUTPUT_PATH = None
INPUT_FILE_EXTENSION = "*.pdf"
OUTPUT_FILE_EXTENSION = ".txt"
def acrobat_extract_text(f_path, f_path_out, f_basename, f_ext):
avDoc = Dispatch("AcroExch.AVDoc") # Connect to Adobe Acrobat
# Open the input file (as a pdf)
ret = avDoc.Open(f_path, f_path)
assert(ret) # FIXME: Documentation says "-1 if the file was opened successfully, 0 otherwise", but this is a bool in practise?
pdDoc = avDoc.GetPDDoc()
dst = os.path.join(f_path_out, ''.join((f_basename, f_ext)))
# Adobe documentation says "For that reason, you must rely on the documentation to know what functionality is available through the JSObject interface. For details, see the JavaScript for Acrobat API Reference"
jsObject = pdDoc.GetJSObject()
# Here you can save as many other types by using, for instance: "com.adobe.acrobat.xml"
jsObject.SaveAs(dst, "com.adobe.acrobat.accesstext")
pdDoc.Close()
avDoc.Close(True) # We want this to close Acrobat, as otherwise Acrobat is going to refuse processing any further files after a certain threshold of open files are reached (for example 50 PDFs)
del pdDoc
if __name__ == "__main__":
assert(5 == len(sys.argv)), sys.argv # <script name>, <script_file_input_path>, <script_file_input_extension>, <script_file_output_path>, <script_file_output_extension>
#$ python get.txt.from.multiple.pdf.py 'C:\input' '*.pdf' 'C:\output' '.txt'
ROOT_INPUT_PATH = sys.argv[1]
INPUT_FILE_EXTENSION = sys.argv[2]
ROOT_OUTPUT_PATH = sys.argv[3]
OUTPUT_FILE_EXTENSION = sys.argv[4]
# tuples are of schema (path_to_file, filename)
matching_files = ((os.path.join(_root, filename), os.path.splitext(filename)[0]) for _root, _dirs, _files in walk(ROOT_INPUT_PATH) for filename in fnmatch.filter(_files, INPUT_FILE_EXTENSION))
# patch ERRORS_BAD_CONTEXT as per https://mail.python.org/pipermail/python-win32/2002-March/000265.html
global ERRORS_BAD_CONTEXT
ERRORS_BAD_CONTEXT.append(winerror.E_NOTIMPL)
for filename_with_path, filename_without_extension in matching_files:
print "Processing '{}'".format(filename_without_extension)
acrobat_extract_text(filename_with_path, ROOT_OUTPUT_PATH, filename_without_extension, OUTPUT_FILE_EXTENSION)
I have tested this on WinPython x64 2.7.6.3, Acrobat X Pro

makepy.py is a script that comes with the win32com python package.
Running it for your installation "wires" python into the COM/OLE object in Windows. The following is an excerpt of some code I used to talk to Excel and do some stuff in it. This example gets the name of sheet 1 in the current workbook. It automatically runs makepy if it has an exception:
import win32com;
import win32com.client;
from win32com.client import selecttlb;
def attachExcelCOM():
makepyExe = r'python C:\Python25\Lib\site-packages\win32com\client\makepy.py';
typeList = selecttlb.EnumTlbs();
for tl in typeList:
if (re.match('^Microsoft.*Excel.*', tl.desc, re.IGNORECASE)):
makepyCmd = "%s -d \"%s\"" % (makepyExe, tl.desc);
os.system(makepyCmd);
# end if
# end for
# end def
def getSheetName(sheetNum):
try:
xl = win32com.client.Dispatch("Excel.Application");
wb = xl.Workbooks.Item(sheetNum);
except Exception, detail:
print 'There was a problem attaching to Excel, refreshing connect config...';
print Exception, str(detail);
attachExcelCOM();
try:
xl = win32com.client.Dispatch("Excel.Application");
wb = xl.Workbooks.Item(sheetNum);
except:
print 'Could not attach to Excel...';
sys.exit(-1);
# end try/except
# end try/except
wsName = wb.Name;
if (wsName == 'PERSONAL.XLS'):
return( None );
# end if
print 'The target worksheet is:';
print ' ', wsName;
print 'Is this correct? [Y/N]',;
answer = string.strip( sys.stdin.readline() );
answer = answer.upper();
if (answer != 'Y'):
print 'Sheet not identified correctly.';
return(None);
# end if
return( (wb, wsName) );
# end def
# -- Main --
sheetInfo = getSheetName(sheetNum);
if (sheetInfo == None):
print 'Sheet not found';
sys.exit(-1);
else:
(wb, wsName) = sheetInfo;
# end if

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Copying files from SMB server to local drive(Linux/Windows) in python - python

Related

FileNotFoundError even though Absolute Path Specified

Python for/while loop

Why does desktop.getCurrentComponent() return None in PyUNO?

most efficient way to create file and delete file of same name in for loop python

"Not implemented" Exception when using pywin32 to control Adobe Acrobat

Categories

Resources