How to run multiple Azure Functions in parallel that scroll through Elasticsearch? (Python)

I have a setup where I need to extract data from Elasticsearch and store it in Azure Blob Storage. To get the data I am using Elasticsearch's _search and _scroll APIs. The indexes are pretty well designed and are formatted something like game1.*, game2.*, game3.*, etc.
I've created a worker.py file, which I stored in a folder called shared_code as Microsoft suggests, and I have several Timer Trigger Functions which import and call worker.py. Due to the way ES was set up on our side, I had to create a VNET and a static outbound IP address, which we then whitelisted on ES. In addition, the data can only be extracted from ES on port 9200. So I've created an Azure Function App which has the connection set up, and I am trying to create multiple Functions (game1-worker, game2-worker, game3-worker) that pull the data from ES in parallel, triggered at minute 5. I've noticed that if I add the FUNCTIONS_WORKER_PROCESS_COUNT = 1 setting, the functions wait until the first triggered one finishes its task before the second one triggers. If I don't add this app setting, or increase the number, then once a function has stopped because it finished its work, it tries to start again and I get an OSError: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted error. Is there a way I can make these run in parallel but not have the mentioned error?
Here is the code for the worker.py:
#!/usr/bin/env python
# coding: utf-8
# # Elasticsearch to Azure Microservice
import json, datetime, gzip, importlib, os, re, logging
from elasticsearch import Elasticsearch
import azure.storage.blob as azsb
import azure.identity as azi
import os
import tempfile
def batch(game_name, env='prod'):
    # #### Global Variables
    env = env.lower()
    connection_string = os.getenv('conn_storage')
    lowerFormat = game_name.lower().replace(" ","_")
    azFormat = re.sub(r'[^0-9a-zA-Z]+', '-', game_name).lower()
    storageContainerName = azFormat
    stateStorageContainerName = "azure-webjobs-state"
    minutesOffset = 5
    tempFilePath = tempfile.gettempdir()
    curFileName = f"{lowerFormat}_cursor.py"
    curTempFilePath = os.path.join(tempFilePath,curFileName)
    curBlobFilePath = f"cursors/{curFileName}"
    esUrl = os.getenv('esUrl')

    # #### Connections
    es = Elasticsearch(
        esUrl,
        port=9200,
        timeout=300)

    def uploadJsonGzipBlob(filePathAndName, jsonBody):
        blob = azsb.BlobClient.from_connection_string(
            conn_str=connection_string,
            container_name=storageContainerName,
            blob_name=filePathAndName
        )
        blob.upload_blob(gzip.compress(bytes(json.dumps(jsonBody), encoding='utf-8')))

    def getAndLoadCursor(filePathAndName):
        # Get cursor from blob
        blob = azsb.BlobClient.from_connection_string(
            conn_str=os.getenv('AzureWebJobsStorage'),
            container_name=stateStorageContainerName,
            blob_name=filePathAndName
        )
        # Stream it to Temp file
        with open(curTempFilePath, "wb") as f:
            data = blob.download_blob()
            data.readinto(f)
        # Load it by path
        spec = importlib.util.spec_from_file_location("cursor", curTempFilePath)
        cur = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(cur)
        return cur

    def writeCursor(filePathAndName, body):
        blob = azsb.BlobClient.from_connection_string(
            conn_str=os.getenv('AzureWebJobsStorage'),
            container_name=stateStorageContainerName,
            blob_name=filePathAndName
        )
        blob.upload_blob(body, overwrite=True)

    # Parameter and state settings
    if os.getenv(f"{lowerFormat}_maxSizeMB") is None:
        maxSizeMB = 10 # Default to 10 MB
    else:
        maxSizeMB = int(os.getenv(f"{lowerFormat}_maxSizeMB"))

    if os.getenv(f"{lowerFormat}_maxProcessTimeSeconds") is None:
        maxProcessTimeSeconds = 300 # Default to 300 seconds
    else:
        maxProcessTimeSeconds = int(os.getenv(f"{lowerFormat}_maxProcessTimeSeconds"))

    try:
        cur = getAndLoadCursor(curBlobFilePath)
    except Exception as e:
        dtStr = f"{datetime.datetime.utcnow():%Y/%m/%d %H:%M:00}"
        writeCursor(curBlobFilePath, f"# Please use format YYYY/MM/DD HH24:MI:SS\nlastPolled = '{dtStr}'")
        logging.info(f"No cursor file. Generated {curFileName} file with date {dtStr}")
        return 0

    # # Scrolling and Batching Engine
    lastRowDateOffset = cur.lastPolled
    nrFilesThisInstance = 0

    while 1:
        # Offset the current time by -5 minutes to account for the 2-3 min delay in Elasticsearch
        initTime = datetime.datetime.utcnow()
        ## Filter lt (less than) endDate to avoid infinite loops.
        ## Filter lt manually when compiling historical based on
        endDate = initTime-datetime.timedelta(minutes=minutesOffset)
        endDate = f"{endDate:%Y/%m/%d %H:%M:%S}"

        doc = {
            "query": {
                "range": {
                    "baseCtx.date": {
                        "gt": lastRowDateOffset,
                        "lt": endDate
                    }
                }
            }
        }

        Index = lowerFormat + ".*"
        if env == 'dev': Index = 'dev.' + Index

        if nrFilesThisInstance == 0:
            page = es.search(
                index = Index,
                sort = "baseCtx.date:asc",
                scroll = "2m",
                size = 10000,
                body = doc
            )
        else:
            page = es.scroll(scroll_id = sid, scroll = "10m")

        pageSize = len(page["hits"]["hits"])
        data = page["hits"]["hits"]
        sid = page["_scroll_id"]
        totalSize = page["hits"]["total"]
        print(f"Total Size: {totalSize}")

        cnt = 0
        # totalSize might be flawed as it returns at times an integer > 0 but array is empty
        # To overcome this, I've added the below check for the array size instead
        if pageSize == 0: break

        while 1:
            cnt += 1
            page = es.scroll(scroll_id = sid, scroll = "10m")
            pageSize = len(page["hits"]["hits"])
            sid = page["_scroll_id"]
            data += page["hits"]["hits"]

            sizeMB = len(gzip.compress(bytes(json.dumps(data), encoding='utf-8'))) / (1024**2)
            loopTime = datetime.datetime.utcnow()
            processTimeSeconds = (loopTime-initTime).seconds

            print(f"{cnt} Results pulled: {pageSize} -- Cumulative Results: {len(data)} -- Gzip Size MB: {sizeMB} -- processTimeSeconds: {processTimeSeconds} -- pageSize: {pageSize} -- startDate: {lastRowDateOffset} -- endDate: {endDate}")

            if sizeMB > maxSizeMB: break
            if processTimeSeconds > maxProcessTimeSeconds: break
            if pageSize < 10000: break

        lastRowDateOffset = max([x['_source']['baseCtx']['date'] for x in data])
        lastRowDateOffsetDT = datetime.datetime.strptime(lastRowDateOffset, '%Y/%m/%d %H:%M:%S')
        outFile = f"elasticsearch/live/{lastRowDateOffsetDT:%Y/%m/%d/%H}/{lowerFormat}_live_{lastRowDateOffsetDT:%Y%m%d%H%M%S}.json.gz"

        uploadJsonGzipBlob(outFile, data)
        writeCursor(curBlobFilePath, f"# Please use format YYYY/MM/DD HH24:MI:SS\nlastPolled = '{lastRowDateOffset}'")
        nrFilesThisInstance += 1

        logging.info(f"File compiled: {outFile} -- {sizeMB} MB\n")

        # If the while loop ran for more than maxProcessTimeSeconds then end it
        if processTimeSeconds > maxProcessTimeSeconds: break
        if pageSize < 10000: break

    logging.info(f"Closing Connection to {esUrl}")
    es.close()
    return 0
And these are two of the timer triggers I am calling:
game1-worker
import logging
import datetime
import azure.functions as func
#from shared_code import worker
import importlib
def main(mytimer: func.TimerRequest) -> None:
    utc_timestamp = datetime.datetime.utcnow().replace(
        tzinfo=datetime.timezone.utc).isoformat()
    if mytimer.past_due:
        logging.info('The timer is past due!')
    # Load a new instance of worker.py
    spec = importlib.util.spec_from_file_location("worker", "shared_code/worker.py")
    worker = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(worker)
    worker.batch('game1name')
    logging.info('Python timer trigger function ran at %s', utc_timestamp)
game2-worker
import logging
import datetime
import azure.functions as func
#from shared_code import worker
import importlib
def main(mytimer: func.TimerRequest) -> None:
    utc_timestamp = datetime.datetime.utcnow().replace(
        tzinfo=datetime.timezone.utc).isoformat()
    if mytimer.past_due:
        logging.info('The timer is past due!')
    # Load a new instance of worker.py
    spec = importlib.util.spec_from_file_location("worker", "shared_code/worker.py")
    worker = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(worker)
    worker.batch('game2name')
    logging.info('Python timer trigger function ran at %s', utc_timestamp)

TL;DR
Based on what you described, multiple worker processes share the underlying runtime's resources (sockets).
For your use case you just need to leave FUNCTIONS_WORKER_PROCESS_COUNT at 1. The default value is supposed to be 1, so not specifying it should mean the same as setting it to 1.
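For reference (an assumption about your setup, not something shown above): this is an application setting on the Function App; when running locally it would sit under Values in local.settings.json, roughly like this:
{
  "IsEncrypted": false,
  "Values": {
    "AzureWebJobsStorage": "<storage connection string>",
    "FUNCTIONS_WORKER_RUNTIME": "python",
    "FUNCTIONS_WORKER_PROCESS_COUNT": "1"
  }
}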
You need to understand how Azure Functions scale; it is quite unintuitive/confusing.
This assumes the Consumption Plan.
Coding: You write Functions, say F1 and F2. How you organize them is up to you.
Provisioning:
You create a Function App.
You deploy F1 and F2 to this App.
You start the App (not the individual Functions).
Runtime:
At start:
Azure spawns one Function Host. Think of this as a container/OS.
Inside the Host, one worker process is created. This worker process hosts one instance of the App.
If you change FUNCTIONS_WORKER_PROCESS_COUNT to, say, 10, then the Host will spawn 10 processes and run your App inside each of them.
When a Function is triggered (a Function could be triggered by a timer, a REST call, a message in a queue, ...):
Each worker process can service one request at a time, be it a request for F1 or F2. One at a time.
Each Host can service one request per worker process in it.
If the backlog of requests grows, the Azure load balancer triggers a scale-out and creates new Function Hosts.
Based on the limited information, creating 3 functions seems like poor design. You could instead create a single timer-triggered function that sends out 3 messages to a queue (a Storage Queue is more than enough for such minuscule traffic), which in turn trigger your actual function/implementation (a Storage-Queue-triggered Function). Each message would be something like {"game_name": "game1"}. A rough sketch of this pattern follows.
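Below is a minimal, hedged sketch of that fan-out pattern. It assumes the azure-storage-queue v12 SDK, a hypothetical queue named es-extract-jobs, and that shared_code is importable as a package; none of these names come from the original setup, and each function would still live in its own folder with its own function.json bindings.

import json
import logging
import os

import azure.functions as func
from azure.storage.queue import QueueClient, TextBase64EncodePolicy

from shared_code import worker  # assumes shared_code/__init__.py exists

GAMES = ["game1name", "game2name", "game3name"]  # hypothetical game names


def dispatch(mytimer: func.TimerRequest) -> None:
    """Timer-triggered dispatcher: enqueue one message per game."""
    queue = QueueClient.from_connection_string(
        conn_str=os.getenv("AzureWebJobsStorage"),
        queue_name="es-extract-jobs",
        # The Functions queue trigger expects base64-encoded messages by default.
        message_encode_policy=TextBase64EncodePolicy(),
    )
    for game in GAMES:
        queue.send_message(json.dumps({"game_name": game}))


def extract(msg: func.QueueMessage) -> None:
    """Queue-triggered worker, bound to es-extract-jobs in its function.json."""
    payload = json.loads(msg.get_body().decode("utf-8"))
    logging.info("Starting Elasticsearch extraction for %s", payload["game_name"])
    worker.batch(payload["game_name"])

Each queue message is then processed by its own invocation, so the three extractions run concurrently without the timer functions competing for the same worker process.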

Related

AMP (Azure Media Player) playback speed in Python

I'm trying to change the playback speed in the Azure Media Player (AMP).
The following is the URL generated from the Azure APIs: https://ampdemo.azureedge.net/?url=https://testingmedia-usea.streaming.media.azure.net/bbd51d47-cc1a-4515-bac8-4053040f8c58/ignite.ism/manifest(format=mpd-time-cmaf,filter=filter1)&heuristicprofile=lowlatency
If you check that link, there is no playback speed control.
I saw the link below but don't know where to apply it in my Python code:
https://amp.azure.net/libs/amp/latest/docs/index.html#amp.player.options.playbackspeed
Below is my code:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.mgmt.media import AzureMediaServices
from azure.storage.blob import BlobServiceClient
from azure.mgmt.media.models import (
Asset,
Transform,
TransformOutput,
BuiltInStandardEncoderPreset,
Job,
JobInputAsset,
JobOutputAsset,
OnErrorType,
Priority,
StreamingLocator,
AssetFilter,
PresentationTimeRange,
)
import os
import random
#Timer for checking job progress
import time
import requests
#Get environment variables
load_dotenv()
default_credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True)
# Get the environment variables SUBSCRIPTIONID, RESOURCEGROUP and ACCOUNTNAME
subscription_id = os.getenv('SUBSCRIPTIONID')
resource_group = os.getenv('RESOURCEGROUP')
account_name = os.getenv('ACCOUNTNAME')
# The file you want to upload. For this example, put the file in the same folder as this script.
# The file ignite.mp4 has been provided for you.
source_file = "https://testingmedia.blob.core.windows.net/data/ignite.mp4"
#url = requests.get(source_file)
# This is a random string that will be added to the naming of things so that you don't have to keep doing this during testing
uniqueness = "streamAssetFilters-" + str(random.randint(0,9999))
# Change this to your specific streaming endpoint name if not using "default"
streaming_endpoint_name = "default"
# Set the attributes of the input Asset using the random number
in_asset_name = 'inputassetName' + uniqueness
in_alternate_id = 'inputALTid' + uniqueness
in_description = 'inputdescription' + uniqueness
# Create an Asset object
# The asset_id will be used for the container parameter for the storage SDK after the asset is created by the AMS client.
in_asset = Asset(alternate_id=in_alternate_id, description=in_description)
# Set the attributes of the output Asset using the random number
out_asset_name = 'outputassetName' + uniqueness
out_alternate_id = 'outputALTid' + uniqueness
out_description = 'outputdescription' + uniqueness
# Create an output asset object
out_asset = Asset(alternate_id=out_alternate_id, description=out_description)
# The AMS Client
print("Creating AMS Client")
client = AzureMediaServices(default_credential, subscription_id)
# Create an input Asset
print(f"Creating input asset {in_asset_name}")
input_asset = client.assets.create_or_update(resource_group, account_name, in_asset_name, in_asset)
# An AMS asset is a container with a specific id that has "asset-" prepended to the GUID.
# So, you need to create the asset id to identify it as the container
# where Storage is to upload the video (as a block blob)
in_container = 'asset-' + input_asset.asset_id
# create an output Asset
print(f"Creating output asset {out_asset_name}")
output_asset = client.assets.create_or_update(resource_group, account_name, out_asset_name, out_asset)
### Use the Storage SDK to upload the video ###
print(f"Uploading the file {source_file}")
blob_service_client = BlobServiceClient.from_connection_string(os.getenv('STORAGEACCOUNTCONNECTION'))
blob_client = blob_service_client.get_blob_client(in_container, "ignite.mp4")
# working_dir = os.getcwd() + "\Media"
# print(working_dir)
# print(f"Current working directory: {working_dir}")
# upload_file_path = os.path.join(working_dir, source_file)
# print(upload_file_path,"####")
# WARNING: Depending on where you are launching the sample from, the path here could be off, and not include the BasicEncoding folder.
# Adjust the path as needed depending on how you are launching this python sample file.
# Upload the video to storage as a block blob
#with open(url, "rb") as data:
blob_client.upload_blob_from_url(source_file)
transform_name = 'ContentAwareEncodingAssetFilters'
# Create a new Standard encoding Transform for Built-in Copy Codec
print(f"Creating Encoding transform named: {transform_name}")
# For this snippet, we are using 'BuiltInStandardEncoderPreset'
transform_output = TransformOutput(
preset=BuiltInStandardEncoderPreset(
preset_name="ContentAwareEncoding"
),
# What should we do with the job if there is an error?
on_error=OnErrorType.STOP_PROCESSING_JOB,
# What is the relative priority of this job to others? Normal, high or low?
relative_priority=Priority.NORMAL
)
print("Creating encoding transform...")
# Adding transform details
my_transform = Transform()
my_transform.description="Transform with Asset filters"
my_transform.outputs = [transform_output]
print(f"Creating transform {transform_name}")
transform = client.transforms.create_or_update(
resource_group_name=resource_group,
account_name=account_name,
transform_name=transform_name,
parameters=my_transform)
print(f"{transform_name} created (or updated if it existed already). ")
job_name = 'ContentAwareEncodingAssetFilters'+ uniqueness
print(f"Creating custom encoding job {job_name}")
files = (source_file)
# Create Job Input and Ouput Assets
input = JobInputAsset(asset_name=in_asset_name)
outputs = JobOutputAsset(asset_name=out_asset_name)
# Create the job object and then create transform job
the_job = Job(input=input, outputs=[outputs])
job: Job = client.jobs.create(resource_group, account_name, transform_name, job_name, parameters=the_job)
# Check job state
job_state = client.jobs.get(resource_group, account_name, transform_name, job_name)
# First check
print("First job check")
print(job_state.state)
# Check the state of the job every 10 seconds. Adjust time_in_seconds = <how often you want to check for job state>
def countdown(t):
while t:
mins, secs = divmod(t, 60)
timer = '{:02d}:{:02d}'.format(mins, secs)
print(timer, end="\r")
time.sleep(1)
t -= 1
job_current = client.jobs.get(resource_group, account_name, transform_name, job_name)
if(job_current.state == "Finished"):
print(job_current.state)
# TODO: Download the output file using blob storage SDK
return
if(job_current.state == "Error"):
print(job_current.state)
# TODO: Provide Error details from Job through API
return
else:
print(job_current.state)
countdown(int(time_in_seconds))
time_in_seconds = 10
countdown(int(time_in_seconds))
print(f"Creating locator for streaming...")
# Publish the output asset for streaming via HLS or DASH
locator_name = f"locator-{uniqueness}"
# Create the Asset filters
print("Creating an asset filter...")
asset_filter_name = 'filter1'
# Create the asset filter
asset_filter = client.asset_filters.create_or_update(
resource_group_name=resource_group,
account_name=account_name,
asset_name=out_asset_name,
filter_name=asset_filter_name,
parameters=AssetFilter(
# In this sample, we are going to filter the manifest by the time range of the presentation using the default timescale.
# You can adjust these settings for your own needs. Not that you can also control output tracks, and quality levels with a filter.
tracks=[],
# start_timestamp = 100000000 and end_timestamp = 300000000 using the default timescale will generate
# a play-list that contains fragments from between 10 seconds and 30 seconds of the VoD presentation.
# If a fragment straddles the boundary, the entire fragment will be included in the manifest.
presentation_time_range=PresentationTimeRange(start_timestamp=100000000, end_timestamp=300000000)
)
)
if asset_filter:
print(f"The asset filter ({asset_filter_name}) was successfully created.")
print()
else:
raise ValueError("There was an issue creating the asset filter.")
if output_asset:
streaming_locator = StreamingLocator(asset_name=out_asset_name, streaming_policy_name="Predefined_DownloadAndClearStreaming",filters=list(asset_filter_name.split(" ")))
locator = client.streaming_locators.create(
resource_group_name=resource_group,
account_name=account_name,
streaming_locator_name=locator_name,
parameters=streaming_locator
)
if locator:
print(f"The streaming locator {locator_name} was successfully created!")
else:
raise Exception(f"Error while creating streaming locator {locator_name}")
if locator.name:
hls_format = "format=m3u8-cmaf"
dash_format = "format=mpd-time-cmaf"
# Get the default streaming endpoint on the account
streaming_endpoint = client.streaming_endpoints.get(
resource_group_name=resource_group,
account_name=account_name,
streaming_endpoint_name=streaming_endpoint_name
)
if streaming_endpoint.resource_state != "Running":
print(f"Streaming endpoint is stopped. Starting endpoint named {streaming_endpoint_name}")
client.streaming_endpoints.begin_start(resource_group, account_name, streaming_endpoint_name)
basename_tup = os.path.splitext(source_file) # Extracting the filename and extension
path_extension = basename_tup[1] # Setting extension of the path
manifest_name = os.path.basename(source_file).replace(path_extension, "")
print(f"The manifest name is: {manifest_name}")
manifest_base = f"https://{streaming_endpoint.host_name}/{locator.streaming_locator_id}/{manifest_name}.ism/manifest"
hls_manifest = ""
if asset_filter_name is None:
hls_manifest = f'{manifest_base}({hls_format})'
else:
hls_manifest = f'{manifest_base}({hls_format},filter={asset_filter_name})'
print(f"The HLS (MP4) manifest URL is: {hls_manifest}")
print("Open the following URL to playback the live stream in an HLS compliant player (HLS.js, Shaka, ExoPlayer) or directly in an iOS device")
print({hls_manifest})
print()
dash_manifest = ""
if asset_filter_name is None:
dash_manifest = f'{manifest_base}({dash_format})'
else:
dash_manifest = f'{manifest_base}({dash_format},filter={asset_filter_name})'
print(f"The DASH manifest URL is: {dash_manifest}")
print("Open the following URL to playback the live stream from the LiveOutput in the Azure Media Player")
print(f"https://ampdemo.azureedge.net/?url={dash_manifest}&heuristicprofile=lowlatency")
print()
else:
raise ValueError("Locator was not created or Locator name is undefined.")
There's an example on https://amp.azure.net/libs/amp/latest/samples/dynamic_playback_speed.html for how to use playback speed. This is also available at https://github.com/Azure-Samples/azure-media-player-samples/blob/master/html/dynamic_playback_speed.html.

Multiprocessing In Django Function

Is it possible to use multiprocessing in Django when handling a request?
#so if I send a request to http://127.0.0.1:8000/wallet_verify
def wallet_verify(request):
    walelts = botactive.objects.all()
    # Here I check if the user wants to be included in the process or not, so if they set it to True I'll include them, else ignore.
    for active in walelts:
        check_active = active.active
        if check_active == True:
            user_is_active = active.user
            # For the ones that want to be included I then go to get their key data.
            # I need to get both the API key and secret, so I loop through to get the data for the active users.
            database = Bybitapidatas.objects.filter(user=user_is_active)
            for apikey in database:
                apikey = apikey.apikey
            for apisecret in database:
                apisecret = apisecret.apisecret
            # Since I am making a request to an exchange endpoint I can only include one API key and secret at a time, i.e. one person at a time; this is why I want to run in parallel.
            for a, b in zip(list(Bybitapidatas.objects.filter(user=user_is_active).values("apikey")), list(Bybitapidatas.objects.filter(user=user_is_active).values("apisecret"))):
                session = spot.HTTP(endpoint='https://api-testnet.bybit.com/', api_key=a['apikey'], api_secret=b['apisecret'])
                # Here I check to see if they have balance to open trades, if they have selected to be included.
                GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
                for i in GET_USDT_BALANCE:
                    if 'USDT' in i.values():
                        GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
                        idx_USDT = GET_USDT_BALANCE.index(i)
                        GET_USDTBALANCE = session.get_wallet_balance()['result']['balances'][idx_USDT]['free']
                        print(round(float(GET_USDTBALANCE),2))
                        # If they don't have enough balance I skip the user.
                        if round(float(GET_USDTBALANCE),2) < 11 :
                            pass
                        else:
                            session.place_active_order(
                                symbol="BTCUSDT",
                                side="Buy",
                                type="MARKET",
                                qty=10,
                                timeInForce="GTC"
                            )
How can I run this process in parallel while looping through the database to get the data for each individual user?
I am still new to coding, so I hope I explained it in a way that makes sense.
I have tried multiprocessing and pools, but I get an error that the app has not started yet and that I have to run it outside of wallet_verify. Is there a way to do it inside wallet_verify when I send the POST request?
Any help is appreciated.
Filtering the database to get the users who have set active to True. listi = [1, 3] (these are the user IDs returned):
processess = botactive.objects.filter(active=True).values_list('user')
listi = [row[0] for row in processess]
Get the users from listi and perform the action:
def wallet_verify(listi):
    # print(listi)
    database = Bybitapidatas.objects.filter(user = listi)
    print("---------------------------------------------------- START")
    for apikey in database:
        apikey = apikey.apikey
        print(apikey)
    for apisecret in database:
        apisecret = apisecret.apisecret
        print(apisecret)
    start_time = time.time()
    session =spot.HTTP(endpoint='https://api-testnet.bybit.com/', api_key=apikey, api_secret=apisecret)
    GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
    for i in GET_USDT_BALANCE:
        if 'USDT' in i.values():
            GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
            idx_USDT = GET_USDT_BALANCE.index(i)
            GET_USDTBALANCE = session.get_wallet_balance()['result']['balances'][idx_USDT]['free']
            print(round(float(GET_USDTBALANCE),2))
            if round(float(GET_USDTBALANCE),2) < 11 :
                pass
            else:
                session.place_active_order(
                    symbol="BTCUSDT",
                    side="Buy",
                    type="MARKET",
                    qty=10,
                    timeInForce="GTC"
                )
    print ("My program took", time.time() - start_time, "to run")
    print("---------------------------------------------------- END")
    return HttpResponse("Wallets verified")
verifyt is what I use for the multiprocessing, since I don't want it to run without being requested to run. The initializer starts the Django app for each worker process:
def verifyt(request):
    with ProcessPoolExecutor(max_workers=4, initializer=django.setup) as executor:
        results = executor.map(wallet_verify, listi)
    return HttpResponse("done")

Create a Windows Event listener with win32evtlog

I am trying to develop a listener for the Windows event log. It should wait until anything new is added and when this happens it should catch the new event and pass it as an object so I can create a handler. I have found some things online but nothing has worked so far. I am using win32evtlog and win32event.
The code I have so far is this:
import win32evtlog
import win32event
server = 'localhost' # name of the target computer to get event logs
logtype = 'Application' # 'Application' # 'Security'
hand = win32evtlog.OpenEventLog(server,logtype)
flags = win32evtlog.EVENTLOG_BACKWARDS_READ|win32evtlog.EVENTLOG_SEQUENTIAL_READ
total = win32evtlog.GetNumberOfEventLogRecords(hand)
print(total)
h_evt = win32event.CreateEvent(None, 1, 0, "evt0")
for x in range(10):
    notify = win32evtlog.NotifyChangeEventLog(hand, h_evt)
    wait_result = win32event.WaitForSingleObject(h_evt, win32event.INFINITE)
    print("notify", notify)
The output after I run it and force one event to happen is this:
865
notify None
After this it gets stuck and does not catch any other events.
Thank you in advance for any help
I noticed 2 things. First, you create a manual reset event, but never reset it. Second, you should only need to call win32evtlog.NotifyChangeEventLog once.
import win32evtlog
import win32event
import win32api
server = 'localhost' # name of the target computer to get event logs
logtype = 'MyAppSource'
hand = win32evtlog.OpenEventLog(server, logtype)
flags = win32evtlog.EVENTLOG_FORWARDS_READ | win32evtlog.EVENTLOG_SEQUENTIAL_READ
total = win32evtlog.GetNumberOfEventLogRecords(hand)
print(total)
total += 1
h_evt = win32event.CreateEvent(None, False, False, "evt0")
notify = win32evtlog.NotifyChangeEventLog(hand, h_evt)
for x in range(10):
    wait_result = win32event.WaitForSingleObject(h_evt, win32event.INFINITE)
    readlog = win32evtlog.ReadEventLog(hand, flags, total)
    for event in readlog:
        print(f"{event.TimeGenerated.Format()} : [{event.RecordNumber}] : {event.SourceName}")
    total += len(readlog)
win32evtlog.CloseEventLog(hand)
win32api.CloseHandle(h_evt)
You can change MyAppSource to your source name. For example, on my computer, if I want to monitor the "Dell" source: logtype = 'Dell'
The method above will only work for first level sources. If you want to go to a deeper level, use win32evtlog.EvtSubscribe(). Here, I use it with a callback - borrowing the xml code from the answer linked to in the comments.
import win32evtlog
import xml.etree.ElementTree as ET
channel = 'Microsoft-Windows-Windows Defender/Operational'
def on_event(action, context, event_handle):
    if action == win32evtlog.EvtSubscribeActionDeliver:
        xml = ET.fromstring(win32evtlog.EvtRender(event_handle, win32evtlog.EvtRenderEventXml))
        # xml namespace, root element has a xmlns definition, so we have to use the namespace
        ns = '{http://schemas.microsoft.com/win/2004/08/events/event}'
        event_id = xml.find(f'.//{ns}EventID').text
        level = xml.find(f'.//{ns}Level').text
        channel = xml.find(f'.//{ns}Channel').text
        execution = xml.find(f'.//{ns}Execution')
        process_id = execution.get('ProcessID')
        thread_id = execution.get('ThreadID')
        time_created = xml.find(f'.//{ns}TimeCreated').get('SystemTime')
        print(f'Time: {time_created}, Level: {level} Event Id: {event_id}, Channel: {channel}, Process Id: {process_id}, Thread Id: {thread_id}')
        print(xml.find(f'.//{ns}Data').text)
        print()

handle = win32evtlog.EvtSubscribe(
    channel,
    win32evtlog.EvtSubscribeToFutureEvents,
    None,
    Callback = on_event)
# Wait for user to hit enter...
input()
win32evtlog.CloseEventLog(handle)

Streaming speech recognition with Google Speech-to-Text is leading to improperly timestamped transcripts

My Problem:
The web app I'm building relies on real-time transcription of a user's voice along with timestamps for when each word begins and ends.
Google's Speech-to-Text API has a limit of 4 minutes for streaming requests but I want users to be able to run their mic's for as long as 30 minutes if they so choose.
Thankfully, Google provides its own code examples for how to make successive requests to their Speech-to-Text API in a way that mimics endless streaming speech recognition.
I've adapted their Python infinite streaming example for my purposes (see below for my code). The timestamps provided by Google are pretty accurate but the issue is that when I exceed the streaming limit (4 minutes) and a new request is made, the timestamped transcript returned by Google's API from the new request is off by as much as 5 seconds or more.
Below is an example of the output when I adjust the streaming limit to 10 seconds (so a new request to Google's Speech-to-Text API begins every 10 seconds).
The timestamp you see printed next to each transcribed response (the 'corrected_time' in the code) is the timestamp for the end of the transcribed line, not the beginning. These timestamps are accurate for the first request but are off by ~4 seconds in the second request and ~9 seconds in the third request.
In a Nutshell, I want to make sure that when the streaming limit is exceeded and a new request is made, the timestamps returned by Google for that new request are adjusted accurately.
My Code:
To help you understand what's going on, I would recommend running it on your machine (only takes a couple of minutes to get working if you have a Google Cloud service account).
I've included more detail on my current diagnosis below the code.
#!/usr/bin/env python
"""Google Cloud Speech API sample application using the streaming API.
NOTE: This module requires the dependencies `pyaudio`.
To install using pip:
pip install pyaudio
Example usage:
python THIS_FILENAME.py
"""
# [START speech_transcribe_infinite_streaming]
import os
import re
import sys
import time
from google.cloud import speech
import pyaudio
from six.moves import queue
# Audio recording parameters
STREAMING_LIMIT = 20000 # 20 seconds (originally 4 mins but shortened for testing purposes)
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE / 10) # 100ms
# Environment Variable set for Google Credentials. Put the json service account
# key in the root directory
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'YOUR_SERVICE_ACCOUNT_KEY.json'
def get_current_time():
"""Return Current Time in MS."""
return int(round(time.time() * 1000))
class ResumableMicrophoneStream:
"""Opens a recording stream as a generator yielding the audio chunks."""
def __init__(self, rate, chunk_size):
self._rate = rate
self.chunk_size = chunk_size
self._num_channels = 1
self._buff = queue.Queue()
self.closed = True
self.start_time = get_current_time()
self.restart_counter = 0
self.audio_input = []
self.last_audio_input = []
self.result_end_time = 0
self.is_final_end_time = 0
self.final_request_end_time = 0
self.bridging_offset = 0
self.last_transcript_was_final = False
self.new_stream = True
self._audio_interface = pyaudio.PyAudio()
self._audio_stream = self._audio_interface.open(
format=pyaudio.paInt16,
channels=self._num_channels,
rate=self._rate,
input=True,
frames_per_buffer=self.chunk_size,
# Run the audio stream asynchronously to fill the buffer object.
# This is necessary so that the input device's buffer doesn't
# overflow while the calling thread makes network requests, etc.
stream_callback=self._fill_buffer,
)
def __enter__(self):
self.closed = False
return self
def __exit__(self, type, value, traceback):
self._audio_stream.stop_stream()
self._audio_stream.close()
self.closed = True
# Signal the generator to terminate so that the client's
# streaming_recognize method will not block the process termination.
self._buff.put(None)
self._audio_interface.terminate()
def _fill_buffer(self, in_data, *args, **kwargs):
"""Continuously collect data from the audio stream, into the buffer."""
self._buff.put(in_data)
return None, pyaudio.paContinue
def generator(self):
"""Stream Audio from microphone to API and to local buffer"""
while not self.closed:
data = []
"""
THE BELOW 'IF' STATEMENT IS WHERE THE ERROR IS LIKELY OCCURRING
This statement runs when the streaming limit is hit and a new request is made.
"""
if self.new_stream and self.last_audio_input:
chunk_time = STREAMING_LIMIT / len(self.last_audio_input)
if chunk_time != 0:
if self.bridging_offset < 0:
self.bridging_offset = 0
if self.bridging_offset > self.final_request_end_time:
self.bridging_offset = self.final_request_end_time
chunks_from_ms = round(
(self.final_request_end_time - self.bridging_offset)
/ chunk_time
)
self.bridging_offset = round(
(len(self.last_audio_input) - chunks_from_ms) * chunk_time
)
for i in range(chunks_from_ms, len(self.last_audio_input)):
data.append(self.last_audio_input[i])
self.new_stream = False
# Use a blocking get() to ensure there's at least one chunk of
# data, and stop iteration if the chunk is None, indicating the
# end of the audio stream.
chunk = self._buff.get()
self.audio_input.append(chunk)
if chunk is None:
return
data.append(chunk)
# Now consume whatever other data's still buffered.
while True:
try:
chunk = self._buff.get(block=False)
if chunk is None:
return
data.append(chunk)
self.audio_input.append(chunk)
except queue.Empty:
break
yield b"".join(data)
def listen_print_loop(responses, stream):
"""Iterates through server responses and prints them.
The responses passed is a generator that will block until a response
is provided by the server.
Each response may contain multiple results, and each result may contain
multiple alternatives; Here we print only the transcription for the top
alternative of the top result.
In this case, responses are provided for interim results as well. If the
response is an interim one, print a line feed at the end of it, to allow
the next result to overwrite it, until the response is a final one. For the
final one, print a newline to preserve the finalized transcription.
"""
for response in responses:
if get_current_time() - stream.start_time > STREAMING_LIMIT:
stream.start_time = get_current_time()
break
if not response.results:
continue
result = response.results[0]
if not result.alternatives:
continue
transcript = result.alternatives[0].transcript
result_seconds = 0
result_micros = 0
if result.result_end_time.seconds:
result_seconds = result.result_end_time.seconds
if result.result_end_time.microseconds:
result_micros = result.result_end_time.microseconds
stream.result_end_time = int((result_seconds * 1000) + (result_micros / 1000))
corrected_time = (
stream.result_end_time
- stream.bridging_offset
+ (STREAMING_LIMIT * stream.restart_counter)
)
# Display interim results, but with a carriage return at the end of the
# line, so subsequent lines will overwrite them.
if result.is_final:
sys.stdout.write("FINAL RESULT # ")
sys.stdout.write(str(corrected_time/1000) + ": " + transcript + "\n")
stream.is_final_end_time = stream.result_end_time
stream.last_transcript_was_final = True
# Exit recognition if any of the transcribed phrases could be
# one of our keywords.
if re.search(r"\b(exit|quit)\b", transcript, re.I):
sys.stdout.write("Exiting...\n")
stream.closed = True
break
else:
sys.stdout.write("INTERIM RESULT # ")
sys.stdout.write(str(corrected_time/1000) + ": " + transcript + "\r")
stream.last_transcript_was_final = False
def main():
"""start bidirectional streaming from microphone input to speech API"""
client = speech.SpeechClient()
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=SAMPLE_RATE,
language_code="en-US",
max_alternatives=1,
)
streaming_config = speech.StreamingRecognitionConfig(
config=config, interim_results=True
)
mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
print(mic_manager.chunk_size)
sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n')
sys.stdout.write("End (ms) Transcript Results/Status\n")
sys.stdout.write("=====================================================\n")
with mic_manager as stream:
while not stream.closed:
sys.stdout.write(
"\n" + str(STREAMING_LIMIT * stream.restart_counter) + ": NEW REQUEST\n"
)
stream.audio_input = []
audio_generator = stream.generator()
requests = (
speech.StreamingRecognizeRequest(audio_content=content)
for content in audio_generator
)
responses = client.streaming_recognize(streaming_config, requests)
# Now, put the transcription responses to use.
listen_print_loop(responses, stream)
if stream.result_end_time > 0:
stream.final_request_end_time = stream.is_final_end_time
stream.result_end_time = 0
stream.last_audio_input = []
stream.last_audio_input = stream.audio_input
stream.audio_input = []
stream.restart_counter = stream.restart_counter + 1
if not stream.last_transcript_was_final:
sys.stdout.write("\n")
stream.new_stream = True
if __name__ == "__main__":
main()
# [END speech_transcribe_infinite_streaming]
My Current Diagnosis
The 'corrected_time' is not being set correctly when new requests are made. This is due to the 'bridging_offset' not being set correctly. So what we need to look at is the 'generator()' method in the 'ResumableMicrophoneStream' class.
In the 'generator()' method, there is an 'if' statement which is run when the streaming limit is hit and a new request is made
if self.new_stream and self.last_audio_input:
Its purpose appears to be to take any lingering audio data that wasn't finished being transcribed before the streaming limit was hit and add it to the buffer before any new audio chunks so that it's transcribed in the new request.
It is also the responsibility of this 'if' statement to set the 'bridging offset' but I'm not entirely sure what this offset represents. All I know is that however it is being set, it is not being set accurately.
Time offset values show the beginning and the end of each spoken word
that is recognized in the supplied audio. A time offset value
represents the amount of time that has elapsed from the beginning of
the audio, in increments of 100ms.
This tells us that the time offsets you receive are always measured from the beginning of the audio supplied in that request, not from the beginning of your overall session. That would be my guess as to why it's causing your application problems.
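To illustrate that reasoning only (a hypothetical sketch, not part of Google's sample): one way to obtain session-absolute timestamps is to anchor each new streaming request to the wall-clock time at which it actually starts, instead of assuming every previous request lasted exactly STREAMING_LIMIT milliseconds. This deliberately ignores the re-sent bridging audio, which would still need a bridging_offset-style correction.

import time


class StreamClock:
    """Hypothetical helper: anchors each streaming request to wall-clock time."""

    def __init__(self):
        now = int(time.time() * 1000)
        self.session_start_ms = now  # when the microphone session began
        self.stream_start_ms = now   # when the current request began

    def new_request(self):
        # Call this immediately before each new streaming_recognize() request.
        self.stream_start_ms = int(time.time() * 1000)

    def absolute_ms(self, result_end_time_ms):
        # result_end_time is relative to the audio of the current request,
        # so offset it by the request's real start time instead of
        # STREAMING_LIMIT * restart_counter.
        return (self.stream_start_ms - self.session_start_ms) + result_end_time_ms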

How to JSON dump to a rotating file object

I'm writing a program which periodically dumps old data from a RethinkDB database into a file and removes it from the database. Currently, the data is dumped into a single file which grows without limit. I'd like to change this so that the maximum file size is, say, 250 Mb, and the program starts to write to a new output file just before this size is exceeded.
It seems like Python's RotatingFileHandler class for loggers does approximately what I want; however, I'm not sure whether logging can be applied to any JSON-dumpable object or just to strings.
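Logging handlers only write strings (each LogRecord is formatted to str before being passed to the stream), so a JSON-dumpable object has to be serialized first. A minimal sketch with made-up file names and limits:

import json
import logging
from logging.handlers import RotatingFileHandler

archive_logger = logging.getLogger("archiver")
archive_logger.setLevel(logging.INFO)
archive_logger.addHandler(
    RotatingFileHandler("archived_sensor_data.json",
                        maxBytes=250 * 1024 * 1024,  # roll over just before ~250 MB
                        backupCount=100)             # keep up to 100 rotated files
)

document = {"id": "abc123", "timestamp": "2016-11-01T12:00:00+00:00"}  # example record
archive_logger.info(json.dumps(document))  # one JSON document per line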
Another possible approach would be to use (a variant of) Mike Pennington's
RotatingFile class (see python: outfile to another text file if exceed certain file size).
Which of these approaches is likely to be the most fruitful?
For reference, my current program is as follows:
import os
import sys
import json
import rethinkdb as r
import pytz
from datetime import datetime, timedelta
import schedule
import time
import functools
from iclib import RethinkDB
import msgpack
''' The purpose of the Controller is to periodically archive data from the "sensor_data" table so that it does not grow without limit.'''
class Controller(RethinkDB):
def __init__(self, db_address=(os.environ['DB_ADDR'], int(os.environ['DB_PORT'])), db_name=os.environ['DB_NAME']):
super(Controller, self).__init__(db_address=db_address, db_name=db_name) # Initialize the IperCronComponent with the default logger name (in this case, "Controller")
self.db_table = RethinkDB.SENSOR_DATA_TABLE # The table name is "sensor_data" and is stored as a class variable in RethinkDBMixIn
def generate_archiving_query(self, retention_period=timedelta(days=3)):
expiry_time = r.now() - retention_period.total_seconds() # Timestamp before which data is to be archived
if "timestamp" in r.table(self.db_table).index_list().run(self.db): # If "timestamp" is a secondary index
beginning_of_time = r.time(1400, 1, 1, 'Z') # The minimum time of a ReQL time object (i.e., the year 1400 in the UTC timezone)
data_to_archive = r.table(self.db_table).between(beginning_of_time, expiry_time, index="timestamp") # Generate query using "between" (faster)
else:
data_to_archive = r.table(self.db_table).filter(r.row['timestamp'] < expiry_time) # Generate the same query using "filter" (slower, but does not require "timestamp" to be a secondary index)
return data_to_archive
def archiving_job(self, data_to_archive=None, output_file="archived_sensor_data.json"):
if data_to_archive is None:
data_to_archive = self.generate_archiving_query() # By default, the call the "generate_archiving_query" function to generate the query
old_data = data_to_archive.run(self.db, time_format="raw") # Without time_format="raw" the output does not dump to JSON
with open(output_file, 'a') as f:
ids_to_delete = []
for item in old_data:
print item
# msgpack.dump(item, f)
json.dump(item, f)
f.write('\n') # Separate each document by a new line
ids_to_delete.append(item['id'])
r.table(self.db_table).get_all(r.args(ids_to_delete)).delete().run(self.db) # Delete based on ID. It is preferred to delete the entire batch in a single operation rather than to delete them one by one in the for loop.
def test_job_1():
db_name = "ipercron"
table_name = "sensor_data"
port_offset = 1 # To avoid interference of this testing program with the main program, all ports are initialized at an offset of 1 from the default ports using "rethinkdb --port_offset 1" at the command line.
conn = r.connect("localhost", 28015 + port_offset)
r.db(db_name).table(table_name).delete().run(conn)
import rethinkdb_add_data
controller = Controller(db_address=("localhost", 28015+port_offset))
archiving_job = functools.partial(controller.archiving_job, data_to_archive=controller.generate_archiving_query())
return archiving_job
if __name__ == "__main__":
archiving_job = test_job_1()
schedule.every(0.1).minutes.do(archiving_job)
while True:
schedule.run_pending()
It is not completely 'runnable' from the part shown, but the key point is that I would like to replace the line
json.dump(item, f)
with a similar line in which f is a rotating, and not fixed, file object.
Following Stanislav Ivanov, I used json.dumps to convert each RethinkDB document to a string and wrote this to a RotatingFileHandler:
import os
import sys
import json
import rethinkdb as r
import pytz
from datetime import datetime, timedelta
import schedule
import time
import functools
from iclib import RethinkDB
import msgpack
import logging
from logging.handlers import RotatingFileHandler
from random_data_generator import RandomDataGenerator
''' The purpose of the Controller is to periodically archive data from the "sensor_data" table so that it does not grow without limit.'''
os.environ['DB_ADDR'] = 'localhost'
os.environ['DB_PORT'] = '28015'
os.environ['DB_NAME'] = 'ipercron'
class Controller(RethinkDB):
def __init__(self, db_address=None, db_name=None):
if db_address is None:
db_address = (os.environ['DB_ADDR'], int(os.environ['DB_PORT'])) # The default host ("rethinkdb") and port (28015) are stored as environment variables
if db_name is None:
db_name = os.environ['DB_NAME'] # The default database is "ipercron" and is stored as an environment variable
super(Controller, self).__init__(db_address=db_address, db_name=db_name) # Initialize the instance of the RethinkDB class. IperCronComponent will be initialized with its default logger name (in this case, "Controller")
self.db_name = db_name
self.db_table = RethinkDB.SENSOR_DATA_TABLE # The table name is "sensor_data" and is stored as a class variable of RethinkDBMixIn
self.table = r.db(self.db_name).table(self.db_table)
self.archiving_logger = logging.getLogger("archiving_logger")
self.archiving_logger.setLevel(logging.DEBUG)
self.archiving_handler = RotatingFileHandler("archived_sensor_data.log", maxBytes=2000, backupCount=10)
self.archiving_logger.addHandler(self.archiving_handler)
def generate_archiving_query(self, retention_period=timedelta(days=3)):
expiry_time = r.now() - retention_period.total_seconds() # Timestamp before which data is to be archived
if "timestamp" in self.table.index_list().run(self.db):
beginning_of_time = r.time(1400, 1, 1, 'Z') # The minimum time of a ReQL time object (namely, the year 1400 in UTC)
data_to_archive = self.table.between(beginning_of_time, expiry_time, index="timestamp") # Generate query using "between" (faster, requires "timestamp" to be a secondary index)
else:
data_to_archive = self.table.filter(r.row['timestamp'] < expiry_time) # Generate query using "filter" (slower, but does not require "timestamp" to be a secondary index)
return data_to_archive
def archiving_job(self, data_to_archive=None):
if data_to_archive is None:
data_to_archive = self.generate_archiving_query() # By default, the call the "generate_archiving_query" function to generate the query
old_data = data_to_archive.run(self.db, time_format="raw") # Without time_format="raw" the output does not dump to JSON or msgpack
ids_to_delete = []
for item in old_data:
print item
self.dump(item)
ids_to_delete.append(item['id'])
self.table.get_all(r.args(ids_to_delete)).delete().run(self.db) # Delete based on ID. It is preferred to delete the entire batch in a single operation rather than to delete them one by one in the for-loop.
def dump(self, item, mode='json'):
if mode == 'json':
dump_string = json.dumps(item)
elif mode == 'msgpack':
dump_string = msgpack.packb(item)
self.archiving_logger.debug(dump_string)
def populate_database(db_name, table_name, conn):
if db_name not in r.db_list().run(conn):
r.db_create(db_name).run(conn) # Create the database if it does not yet exist
if table_name not in r.db(db_name).table_list().run(conn):
r.db(db_name).table_create(table_name).run(conn) # Create the table if it does not yet exist
r.db(db_name).table(table_name).delete().run(conn) # Empty the table to start with a clean slate
# Generate random data with timestamps uniformly distributed over the past 6 days
random_data_time_interval = timedelta(days=6)
start_random_data = datetime.utcnow().replace(tzinfo=pytz.utc) - random_data_time_interval
random_generator = RandomDataGenerator(seed=0)
packets = random_generator.packets(N=100, start=start_random_data)
# print packets
print "Adding data to the database..."
r.db(db_name).table(table_name).insert(packets).run(conn)
if __name__ == "__main__":
db_name = "ipercron"
table_name = "sensor_data"
port_offset = 1 # To avoid interference of this testing program with the main program, all ports are initialized at an offset of 1 from the default ports using "rethinkdb --port_offset 1" at the command line.
host = "localhost"
port = 28015 + port_offset
conn = r.connect(host, port) # RethinkDB connection object
populate_database(db_name, table_name, conn)
# import rethinkdb_add_data
controller = Controller(db_address=(host, port))
archiving_job = functools.partial(controller.archiving_job, data_to_archive=controller.generate_archiving_query()) # This ensures that the query is only generated once. (This is sufficient since r.now() is re-evaluated every time a connection is made).
schedule.every(0.1).minutes.do(archiving_job)
while True:
schedule.run_pending()
In this context the RethinkDB class does little other than define the class variable SENSOR_DATA_TABLE and the RethinkDB connection, self.db = r.connect(self.address[0], self.address[1]). This is run together with a module for generating fake data, random_data_generator.py:
import random
import faker
from datetime import datetime, timedelta
import pytz
import rethinkdb as r
class RandomDataGenerator(object):
def __init__(self, seed=None):
self._seed = seed
self._random = random.Random()
self._random.seed(seed)
self.fake = faker.Faker()
self.fake.random.seed(seed)
def __getattr__(self, x):
return getattr(self._random, x)
def name(self):
return self.fake.name()
def datetime(self, start=None, end=None):
if start is None:
start = datetime(2000, 1, 1, tzinfo=pytz.utc) # Jan 1st 2000
if end is None:
end = datetime.utcnow().replace(tzinfo=pytz.utc)
if isinstance(end, datetime):
dt = end - start
elif isinstance(end, timedelta):
dt = end
assert isinstance(dt, timedelta)
random_dt = timedelta(microseconds=self._random.randrange(int(dt.total_seconds() * (10 ** 6))))
return start + random_dt
def packets(self, N=1, start=None, end=None):
return [{'name': self.name(), 'timestamp': self.datetime(start=start, end=end)} for _ in range(N)]
When I run the controller it produces several rolled-over output logs, each at most 2 kB in size, as expected.
