Non-recursive method of mapping a remote SharePoint folder - Python

Background:
I need to list all files and folders within a SharePoint document library and carry out some operations on each file during the mapping process (if these were done after mapping, I would have to re-check each file's existence first).
Right now, my solution to this is to recursively map the folder using the Office365-REST-Python-Client and the map_folder method, like so:
import sys
from typing import Any, Callable

from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
from office365.sharepoint.folders.folder import Folder
from office365.sharepoint.files.file import File

class SharePointHandler():
    def __init__(self, username:str, company_site:str, password:str) -> None:
        self.username = username
        self.company_site = company_site
        self.client_context = self.create_client_context(user_password=password)

    def create_client_context(self, user_password) -> ClientContext:
        try:
            ctx_authorization = AuthenticationContext(self.company_site)
            ctx_authorization.acquire_token_for_user(self.username, user_password)  # Deprecated --> must update
            client_context = ClientContext(self.company_site, ctx_authorization)
            return client_context
        except Exception as e:
            sys.exit(1)

    def map_folder(self, to_map:str) -> tuple[list[str], list[str]]:
        file_list, folder_list = [], []

        def enum_folder(parent_folder):
            parent_folder.expand(["Files", "Folders", "ModifiedBy"]).get().execute_query()
            for file in parent_folder.files:
                file_list.append(file.serverRelativeUrl)
            for folder in parent_folder.folders:
                folder_list.append(folder.serverRelativeUrl)
                enum_folder(folder)

        root_folder = self.client_context.web.get_folder_by_server_relative_url(to_map)
        enum_folder(root_folder)
        return file_list, folder_list
To accommodate the aforementioned file operations, I do the following:
def map_folder(self, to_map:str, primary_method:Callable[[File], None]) -> tuple[list[str], list[str]]:
    file_list, folder_list = [], []

    def enum_folder(parent_folder):
        parent_folder.expand(["Files", "Folders", "ModifiedBy"]).get().execute_query()
        for file in parent_folder.files:
            primary_method(file)
            file_list.append(file.serverRelativeUrl)
        for folder in parent_folder.folders:
            folder_list.append(folder.serverRelativeUrl)
            enum_folder(folder)

    root_folder = self.client_context.web.get_folder_by_server_relative_url(to_map)
    enum_folder(root_folder)
    return file_list, folder_list
The above method is similar to the one suggested in the library examples, but is obviously not ideal for a very large, deeply nested folder (which I unfortunately have).
My question:
To avoid potential overflow issues, I would like to do this in a non-recursive / iterative manner using something like collections.deque to aid with stack handling, but am struggling to wrap my head around its implementation in a case like this. Is this possible?
Note:
I only mention the intermediate operations because, in my mind, a solution may impede my ability to do this. However, if this is the price to pay, so be it. I am able to carry out file operations after the fact.
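For reference, this is roughly the shape I imagine the iterative version taking, sketched below by reusing the expand call from my recursive version (untested; using popleft() instead of pop() would make it breadth-first):

from collections import deque

def map_folder_iterative(self, to_map:str) -> tuple[list[str], list[str]]:
    file_list, folder_list = [], []
    # Explicit stack of folders still to visit, replacing the call stack.
    pending = deque([self.client_context.web.get_folder_by_server_relative_url(to_map)])
    while pending:
        current = pending.pop()
        current.expand(["Files", "Folders", "ModifiedBy"]).get().execute_query()
        for file in current.files:
            file_list.append(file.serverRelativeUrl)
        for folder in current.folders:
            folder_list.append(folder.serverRelativeUrl)
            pending.append(folder)
    return file_list, folder_list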

Related

PyYAML: Custom/default constructor for nodes without tags

I have a YAML file for storing constants. Some of the entries have custom tags such as !HANDLER or !EXPR and these are easily handled by adding constructors to the YAML loader.
However, I also want to have a custom constructor for non-tagged nodes. Reason being, I want to add these non-tagged values to a dictionary for use elsewhere. These values need to be available before parsing finishes, hence I can't just let parsing finish and then update the dictionary.
So with a YAML file like
sample_rate: 16000
input_file: !HANDLER
    handler_fn: file_loader
    handler_input: path/to/file
    mode: w
I have a handler constructor
def file_handler_loader(loader, node):
    params = loader.construct_mapping(node)
    handler_fn = params.pop('handler_fn')  # pop once; popping the key twice would raise KeyError
    module = __import__('handlers.file_handlers', fromlist=[handler_fn])
    func = getattr(module, handler_fn)
    handler_input = params.pop('handler_input')
    return func(handler_input, **params)
And a function initialize_constants
def _get_loader():
    loader = FullLoader
    loader.add_constructor('!HANDLER', file_handler_loader)
    loader.add_constructor('!EXPR', expression_loader)
    return loader

def initialize_constants(path_to_yaml: str) -> None:
    try:
        with open(path_to_yaml, 'r') as yaml_file:
            constants = yaml.load(yaml_file, Loader=_get_loader())
    except FileNotFoundError as ex:
        LOGGER.error(ex)
        exit(-1)
The goal is then to have a constructor for non-tagged entries in the YAML. I haven't been able to figure out, though, how to add a constructor for non-tagged entries. Ideally, the code would look like the below:
def default_constructor(loader, node):
    param = loader.construct_scalar(node)
    constants[node_name] = param
I've also attempted to add a resolver to solve the problem. The code below was tested but didn't work as expected.
loader.add_constructor('!DEF', default_constructor)
loader.add_implicit_resolver('!DEF', re.compile('.*'), first=None)

def default_constructor(loader, node):
    # do stuff
In this case, what happened was that the node contained the key sample_rate rather than the value 16000 as expected.
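One direction I've considered, sketched below under the assumption of flat scalar values (I haven't verified how it interacts with the !HANDLER nodes or with deeper nesting), is to override the constructor for the default mapping tag instead of resolving individual scalars:

def tracking_map_constructor(loader, node):
    # Build the mapping with the normal machinery, then record each
    # key/value pair as it becomes available.
    mapping = loader.construct_mapping(node)
    constants.update(mapping)
    return mapping

# 'tag:yaml.org,2002:map' is the tag PyYAML assigns to untagged mappings,
# so this fires for every untagged mapping node, nested ones included.
loader.add_constructor('tag:yaml.org,2002:map', tracking_map_constructor)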
Thanks in advance :)

Adding subtitles to video with moviepy requiring encoding

I have a srt file named subtitles.srt that is non-English, and I followed the instructions of the documentation and source code of the moviepy package (https://moviepy.readthedocs.io/en/latest/_modules/moviepy/video/tools/subtitles.html):
from moviepy.video.tools.subtitles import SubtitlesClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.VideoClip import TextClip  # needed for the generator below

generator = lambda txt: TextClip(txt, font='Georgia-Regular', fontsize=24, color='white')
sub = SubtitlesClip("subtitles.srt", generator, encoding='utf-8')
And this gives the error TypeError: __init__() got an unexpected keyword argument 'encoding'.
In the source code, the class SubtitlesClip does have an encoding keyword argument. Does that mean my installed version is out of date with the source, or something like that? And what can I do about this? I even attempted to copy the source code of moviepy.video.tools.subtitles (the version with the encoding keyword argument) directly into my code, yet that led to more errors, e.g. at the line:
from moviepy.decorators import convert_path_to_string
it failed to import the decorator convert_path_to_string.
The source code does not seem to agree with what I have installed. Anyway to fix it? If not, are there any good alternatives of Python libraries for inserting subtitles or video editing in general?
Edit: My current solution is to create a child class of SubtitlesClip and override the constructor of the parent class:
from moviepy.video.tools.subtitles import SubtitlesClip
from moviepy.video.VideoClip import TextClip, VideoClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
import re
import numpy as np  # needed by make_frame/make_mask_frame below
from moviepy.tools import cvsecs

def file_to_subtitles_with_encoding(filename):
    """ Converts a srt file into subtitles.
    The returned list is of the form ``[((ta,tb),'some text'),...]``
    and can be fed to SubtitlesClip.
    Only works for '.srt' format for the moment.
    """
    times_texts = []
    current_times = None
    current_text = ""
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            times = re.findall("([0-9]*:[0-9]*:[0-9]*,[0-9]*)", line)
            if times:
                current_times = [cvsecs(t) for t in times]
            elif line.strip() == '':
                times_texts.append((current_times, current_text.strip('\n')))
                current_times, current_text = None, ""
            elif current_times:
                current_text += line
    return times_texts

class SubtitlesClipUTF8(SubtitlesClip):
    def __init__(self, subtitles, make_textclip=None):
        VideoClip.__init__(self, has_constant_size=False)
        if isinstance(subtitles, str):
            subtitles = file_to_subtitles_with_encoding(subtitles)
        #subtitles = [(map(cvsecs, tt),txt) for tt, txt in subtitles]
        self.subtitles = subtitles
        self.textclips = dict()
        if make_textclip is None:
            make_textclip = lambda txt: TextClip(txt, font='Georgia-Bold',
                                                 fontsize=24, color='white',
                                                 stroke_color='black', stroke_width=0.5)
        self.make_textclip = make_textclip
        self.start = 0
        self.duration = max([tb for ((ta, tb), txt) in self.subtitles])
        self.end = self.duration

        def add_textclip_if_none(t):
            """ Will generate a textclip if it hasn't been generated yet.
            If there is no subtitle to show at t, return false. """
            sub = [((ta, tb), txt) for ((ta, tb), txt) in self.textclips.keys()
                   if (ta <= t < tb)]
            if not sub:
                sub = [((ta, tb), txt) for ((ta, tb), txt) in self.subtitles if
                       (ta <= t < tb)]
                if not sub:
                    return False
            sub = sub[0]
            if sub not in self.textclips.keys():
                self.textclips[sub] = self.make_textclip(sub[1])
            return sub

        def make_frame(t):
            sub = add_textclip_if_none(t)
            return (self.textclips[sub].get_frame(t) if sub
                    else np.array([[[0, 0, 0]]]))

        def make_mask_frame(t):
            sub = add_textclip_if_none(t)
            return (self.textclips[sub].mask.get_frame(t) if sub
                    else np.array([[0]]))

        self.make_frame = make_frame
        hasmask = bool(self.make_textclip('T').mask)
        self.mask = VideoClip(make_mask_frame, ismask=True) if hasmask else None
I actually only changed two lines, but I have to create a new class and redefine the whole thing, so I doubt whether it's really necessary. Any better solution than this?
The latest version in the documentation (the one you are looking at) corresponds to a dev version 2.x, which is not released to PyPI yet. The version you have installed through pip is most likely 1.0.3, which is the latest on PyPI, and it doesn't allow an encoding parameter.
From the PR where the feature was introduced, you can see that it's only been tagged for release in 2.x versions.
Copying only that file to your source code will most likely not work, because it will depend on changes that happened in between the two versions. However, if you feel adventurous, you can install the dev version of the package, by following the Method by hand section in moviepy's docs.
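If you do go that route, the install usually amounts to something along the lines of pip install git+https://github.com/Zulko/moviepy.git (check the linked docs for the exact, current invocation).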

How to perform a task after flask app starts?

I'm trying to execute a task after the app binds to the port, so it doesn't get killed by Heroku for taking too long on startup. I am aware of the existence of before_first_request; however, I would like this action to be performed as soon as possible after app startup, without requiring a request.
I am loading an object as an attribute of the app object (because I need to access it across requests) and this object has to initialize in a weird way (it checks if a file exists and downloads it if it doesn't and afterwards it performs a bunch of computations).
Currently I'm doing this in the following way:
def create_app() -> Flask:
    ...
    with app.app_context():
        app.model = RecommenderModel()  # This downloads a pretty heavy file if it isn't there
        app.model.load_products()       # This performs a bunch of calculations
    ...
    return app
This initializes the app properly (as tested locally) however Heroku kills it (Error R10) because it takes too long.
Is there a way to do this asynchronously? When I tried to do so the app context got lost.
Edit: Additional information regarding what I'm doing:
The RecommenderModel object models the logic of a recommendation system. As of now, the recommendations are based on vector cosine similarity. Those vectors are extracted using pre-trained word2vec embeddings (which is the large file that needs to be downloaded). The conversion from products to vectors is handled by a Preprocessor class.
The Recommender Model initialization looks like this:
class RecommenderModel(object):
    def __init__(self) -> None:
        self.preproc = Preprocessor()
        self.product_vector: dict = {}

    def load_products(self) -> None:
        for product in Product.get_all():
            self.product_vector[product.id] = self.preproc.compute_vector(product)
The Preprocessor initialization looks like this:
class Preprocessor(object):
    def __init__(self, embeddings: str = embeddings) -> None:
        S3.ensure_file(embeddings)
        self.vectors = KeyedVectors.load_word2vec_format(embeddings)
The S3.ensure_file method basically checks if the file exists and downloads it if it doesn't:
class S3(object):
    client = boto3.client('s3')

    @classmethod
    def ensure_file(cls, filepath: str) -> None:
        if os.path.exists(filepath):
            return
        dirname, filename = os.path.split(filepath)
        bucket_name = os.environ.get('BUCKET_NAME')
        cls.client.download_file(bucket_name, filename, filepath)
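For what it's worth, one direction I've been considering (sketched below, untested, and assuming early requests can tolerate the model not being loaded yet) is to run the heavy initialization in a daemon thread and push the application context explicitly inside it:

import threading
from flask import Flask

def create_app() -> Flask:
    app = Flask(__name__)
    ...

    def _warm_up(flask_app: Flask) -> None:
        # Pushing the context inside the thread avoids losing it.
        with flask_app.app_context():
            flask_app.model = RecommenderModel()
            flask_app.model.load_products()

    # The daemon thread lets the app bind to the port immediately
    # while the loading continues in the background.
    threading.Thread(target=_warm_up, args=(app,), daemon=True).start()
    return app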

How to avoid re-downloading media to S3 in Scrapy?

I previously asked a similar question (How does Scrapy avoid re-downloading media that was downloaded recently?), but since I did not receive a definite answer I'll ask it again.
I've downloaded a large number of files to an AWS S3 bucket using Scrapy's Files Pipeline. According to the documentation (https://doc.scrapy.org/en/latest/topics/media-pipeline.html#downloading-and-processing-files-and-images), this pipeline avoids "re-downloading media that was downloaded recently", but it does not say how long ago "recent" is or how to set this parameter.
Looking at the implementation of the FilesPipeline class at https://github.com/scrapy/scrapy/blob/master/scrapy/pipelines/files.py, it would appear that this is obtained from the FILES_EXPIRES setting, for which the default is 90 days:
class FilesPipeline(MediaPipeline):
    """Abstract pipeline that implement the file downloading
    This pipeline tries to minimize network transfers and file processing,
    doing stat of the files and determining if file is new, uptodate or
    expired.
    `new` files are those that pipeline never processed and needs to be
    downloaded from supplier site the first time.
    `uptodate` files are the ones that the pipeline processed and are still
    valid files.
    `expired` files are those that pipeline already processed but the last
    modification was made long time ago, so a reprocessing is recommended to
    refresh it in case of change.
    """

    MEDIA_NAME = "file"
    EXPIRES = 90
    STORE_SCHEMES = {
        '': FSFilesStore,
        'file': FSFilesStore,
        's3': S3FilesStore,
    }
    DEFAULT_FILES_URLS_FIELD = 'file_urls'
    DEFAULT_FILES_RESULT_FIELD = 'files'

    def __init__(self, store_uri, download_func=None, settings=None):
        if not store_uri:
            raise NotConfigured
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        cls_name = "FilesPipeline"
        self.store = self._get_store(store_uri)
        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name=cls_name,
                                    settings=settings)
        self.expires = settings.getint(
            resolve('FILES_EXPIRES'), self.EXPIRES
        )
        if not hasattr(self, "FILES_URLS_FIELD"):
            self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
        if not hasattr(self, "FILES_RESULT_FIELD"):
            self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
        self.files_urls_field = settings.get(
            resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
        )
        self.files_result_field = settings.get(
            resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
        )
        super(FilesPipeline, self).__init__(download_func=download_func, settings=settings)

    @classmethod
    def from_settings(cls, settings):
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
        s3store.POLICY = settings['FILES_STORE_S3_ACL']
        store_uri = settings['FILES_STORE']
        return cls(store_uri, settings=settings)

    def _get_store(self, uri):
        if os.path.isabs(uri):  # to support win32 paths like: C:\\some\dir
            scheme = 'file'
        else:
            scheme = urlparse(uri).scheme
        store_cls = self.STORE_SCHEMES[scheme]
        return store_cls(uri)

    def media_to_download(self, request, info):
        def _onsuccess(result):
            if not result:
                return  # returning None force download
            last_modified = result.get('last_modified', None)
            if not last_modified:
                return  # returning None force download
            age_seconds = time.time() - last_modified
            age_days = age_seconds / 60 / 60 / 24
            if age_days > self.expires:
                return  # returning None force download
Do I understand this correctly? Also, I do not see a similar check on age_days in the S3FilesStore class; is the age check also implemented for files on S3? (I was also unable to find any tests covering this age-checking feature for S3.)
FILES_EXPIRES is indeed the setting that tells the FilesPipeline how "old" a file can be before it is downloaded (again).
The key section of the code is in media_to_download:
the _onsuccess callback checks the result of the pipeline's self.store.stat_file call, and for your question, it especially looks for the "last_modified" info. If the last-modified time is older than FILES_EXPIRES days, the download is triggered.
You can check how the S3store gets the "last modified" information. It depends on whether botocore is available.
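If you simply want a longer window before re-downloading, the setting goes in your project's settings.py; a minimal sketch:

# settings.py: treat anything downloaded within the last year as up to date
# (the default, per FilesPipeline.EXPIRES above, is 90 days).
FILES_EXPIRES = 365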
A one-line answer would be: class FilesPipeline(MediaPipeline): is the only class responsible for managing, validating and downloading files to your local paths; class S3FilesStore(object): just takes the files from those local paths and uploads them to S3.
class FSFilesStore is the one which manages all the local paths, and FilesPipeline uses it to store your files locally.
Links:
https://github.com/scrapy/scrapy/blob/master/scrapy/pipelines/files.py#L264
https://github.com/scrapy/scrapy/blob/master/scrapy/pipelines/files.py#L397
https://github.com/scrapy/scrapy/blob/master/scrapy/pipelines/files.py#L299

Embedding resources in Python scripts

I'd like to figure out how to embed binary content in a Python script. For instance, I don't want to have any external files around (images, sound, ...); I want all this content living inside my Python scripts.
A little example to clarify: let's say I've got this small snippet:
from StringIO import StringIO
from PIL import Image, ImageFilter
embedded_resource = StringIO(open("Lenna.png", "rb").read())
im = Image.open(embedded_resource)
im.show()
im_sharp = im.filter(ImageFilter.SHARPEN)
im_sharp.show()
As you can see, the example is reading the external file 'Lenna.png'.
Question
How do I embed "Lenna.png" as a resource (variable) in my Python script? What's the fastest way to achieve this simple task using Python?
You might find the following class rather useful for embedding resources in your program. To use it, call the package method with paths to the files that you want to embed. The class will print out a DATA attribute that should be used to replace the one already found in the class. If you want to add files to your pre-built data, use the add method instead. To use the class in your program, make calls to the load method using context manager syntax. The returned value is a Path object that can be used as a filename argument to other functions or for the purpose of directly loading the reconstituted file. See this SMTP Client for example usage.
import base64
import contextlib
import pathlib
import pickle
import pickletools
import sys
import zlib

class Resource:
    """Manager for resources that would normally be held externally."""

    WIDTH = 76
    __CACHE = None
    DATA = b''

    @classmethod
    def package(cls, *paths):
        """Creates a resource string to be copied into the class."""
        cls.__generate_data(paths, {})

    @classmethod
    def add(cls, *paths):
        """Include paths in the pre-generated DATA block up above."""
        cls.__preload()
        cls.__generate_data(paths, cls.__CACHE.copy())

    @classmethod
    def __generate_data(cls, paths, buffer):
        """Load paths into buffer and output DATA code for the class."""
        for path in map(pathlib.Path, paths):
            if not path.is_file():
                raise ValueError('{!r} is not a file'.format(path))
            key = path.name
            if key in buffer:
                raise KeyError('{!r} has already been included'.format(key))
            with path.open('rb') as file:
                buffer[key] = file.read()
        pickled = pickle.dumps(buffer, pickle.HIGHEST_PROTOCOL)
        optimized = pickletools.optimize(pickled)
        compressed = zlib.compress(optimized, zlib.Z_BEST_COMPRESSION)
        encoded = base64.b85encode(compressed)
        cls.__print("    DATA = b'''")
        for offset in range(0, len(encoded), cls.WIDTH):
            cls.__print("\\\n" + encoded[
                slice(offset, offset + cls.WIDTH)].decode('ascii'))
        cls.__print("'''")

    @staticmethod
    def __print(line):
        """Provides alternative printing interface for simplicity."""
        sys.stdout.write(line)
        sys.stdout.flush()

    @classmethod
    @contextlib.contextmanager
    def load(cls, name, delete=True):
        """Dynamically loads resources and makes them usable while needed."""
        cls.__preload()
        if name not in cls.__CACHE:
            raise KeyError('{!r} cannot be found'.format(name))
        path = pathlib.Path(name)
        with path.open('wb') as file:
            file.write(cls.__CACHE[name])
        yield path
        if delete:
            path.unlink()

    @classmethod
    def __preload(cls):
        """Warm up the cache if it does not exist in a ready state yet."""
        if cls.__CACHE is None:
            decoded = base64.b85decode(cls.DATA)
            decompressed = zlib.decompress(decoded)
            cls.__CACHE = pickle.loads(decompressed)

    def __init__(self):
        """Creates an error explaining class was used improperly."""
        raise NotImplementedError('class was not designed for instantiation')
The best way to go about this is to convert your picture into a Python string and keep it in a separate file called something like resources.py; then you simply parse it.
If you are looking to embed the whole thing inside a single binary, then you're looking at something like py2exe. Here is an example embedding external files
In the first scenario, you could even use base64 to (de)code the picture, something like this:
import base64

with open('yourImage.png', 'rb') as file:  # the context manager closes the file for you
    encoded = base64.b64encode(file.read())
data = base64.b64decode(encoded)
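To round that out, here is a sketch of consuming the embedded data at runtime; the EMBEDDED_PNG value is a hypothetical placeholder for the full base64 text you would paste into the script:

import base64
import io
from PIL import Image

# Hypothetical placeholder: in practice this holds the complete base64
# output of the encoding step above.
EMBEDDED_PNG = b'...'

# PIL's Image.open accepts any file-like object, so no temp file is needed.
im = Image.open(io.BytesIO(base64.b64decode(EMBEDDED_PNG)))
im.show()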
