I am trying to write a Python model which is capable of doing some processing in a PostgreSQL database using the multiprocessing module and peewee.
In single-core mode the code works; however, when I try to run it with multiple cores I run into an SSL error.
I would like to post the structure of my model in the hope that somebody can advise how to set up my model properly. Currently, I have chosen an object-oriented approach in which I make one connection which is shared in a pool. To clarify what I have done, I will now show the source code I have so far.
I have three files: main.py, models.py and parser.py. Their contents are as follows.
models.py defines the peewee PostgreSQL table and makes a connection to the Postgres server:
import peewee as pw
from playhouse.pool import PooledPostgresqlExtDatabase
KVK_KEY = "id_number"
NAME_KEY = "name"
N_VOWELS_KEY = "n_vowels"
# initialise the data base
database = PooledPostgresqlExtDatabase(
"testdb", user="postgres", host="localhost", port=5432, password="xxxx",
max_connections=8, stale_timeout=300 )
class BaseModel(pw.Model):
class Meta:
database = database
only_save_dirty = True
# this class describes the format of the sql data base
class Company(BaseModel):
id_number = pw.IntegerField(primary_key=True)
name = pw.CharField(null=True)
n_vowels = pw.IntegerField(default=-1)
processor = pw.IntegerField(default=-1)
def connect_database(database_name, reset_database=False):
""" connect the database """
database.connect()
if reset_database:
database.drop_tables([Company])
database.create_tables([Company])
parser.py contains the CompanyParser class, which is used as the engine of the code to do all the processing. It generates some artificial data, which is stored in the PostgreSQL database, and then the run method is used to do some processing with the data already stored in the database:
import pandas as pd
import numpy as np
import random
import string
import peewee as pw
from models import (Company, database, KVK_KEY, NAME_KEY)
import multiprocessing as mp
MAX_SQL_CHUNK = 1000
np.random.seed(0)
def random_name(size=8, chars=string.ascii_lowercase):
""" Create a random character string of 'size' characters """
return "".join(random.choice(chars) for _ in range(size))
def vowel_count(characters):
"""
Count the number of vowels in the string 'characters' and return as an integer
"""
count = 0
for char in characters:
if char in list("aeiou"):
count += 1
return count
class CompanyParser(mp.Process):
def __init__(self, number_of_companies=100, i_proc=None,
number_of_procs=1,
first_id=None, last_id=None):
if i_proc is not None and number_of_procs > 1:
mp.Process.__init__(self)
self.i_proc = i_proc
self.number_of_procs = number_of_procs
self.n_companies = number_of_companies
self.data_df: pd.DataFrame = None
self.first_id = first_id
self.last_id = last_id
def generate_data(self):
""" Create a dataframe with fake company data and id's """
id_list = np.random.randint(1000000, 9999999, self.n_companies)
company_list = np.array([random_name() for _ in range(self.n_companies)])
self.data_df = pd.DataFrame(data=np.vstack([id_list, company_list]).T,
columns=[KVK_KEY, NAME_KEY])
self.data_df.sort_values([KVK_KEY], inplace=True)
def store_to_database(self):
"""
Store the company data to a sql database
"""
record_list = list(self.data_df.to_dict(orient="index").values())
n_batch = int(len(record_list) / MAX_SQL_CHUNK) + 1
with database.atomic():
for cnt, batch in enumerate(pw.chunked(record_list, MAX_SQL_CHUNK)):
print(f"writing {cnt}/{n_batch}")
Company.insert_many(batch).execute()
def run(self):
print("Making query at {}".format(self.i_proc))
query = (Company.
select().
where(Company.id_number.between(self.first_id, self.last_id)))
print("Found {} companies".format(query.count()))
for cnt, company in enumerate(query):
print("Processing # {} - {}: company {}/{}".format(self.i_proc, cnt,
company.id_number,
company.name))
number_of_vowels = vowel_count(company.name)
company.n_vowels = number_of_vowels
company.processor = self.i_proc
print(f"storing number of vowels: {number_of_vowels}")
company.save()
Finally, my main script loads the classes defined in models.py and parser.py and launches the code:
from models import (Company, connect_database)
from parser import CompanyParser
number_of_processors = 2
connect_database(None, reset_database=True)
# init an object of the CompanyParser and use the create database
parser = CompanyParser()
company_ids = Company.select(Company.id_number)
parser.generate_data()
parser.store_to_database()
n_companies = company_ids.count()
n_comp_per_proc = int(n_companies / number_of_processors)
print("Found {} companies: {} per proc".format(n_companies, n_comp_per_proc))
for i_proc in range(number_of_processors):
i_start = i_proc * n_comp_per_proc
first_id = company_ids[i_start]
last_id = company_ids[i_start + n_comp_per_proc - 1]
print(f"Running proc {i_proc} for id {first_id} until id {last_id}")
sub_parser = CompanyParser(first_id=first_id, last_id=last_id,
i_proc=i_proc,
number_of_procs=number_of_processors)
if number_of_processors > 1:
sub_parser.start()
else:
sub_parser.run()
In case number_of_processors = 1, this script works perfectly fine. It generates artificial data, stores it in the PostgreSQL database, and does some processing on the data (it counts the number of vowels in each name and stores the result in the n_vowels column).
However, when I try to run this with 2 cores (number_of_processors = 2), I run into the following error:
/opt/miniconda3/bin/python /home/eelco/PycharmProjects/multiproc_peewee/main.py
writing 0/1
Found 100 companies: 50 per proc
Running proc 0 for id 1020737 until id 5295565
Running proc 1 for id 5302405 until id 9891087
Making query at 0
Found 50 companies
Processing # 0 - 0: company 1020737/wqrbgxiu
storing number of vowels: 2
Making query at 1
Process CompanyParser-1:
Processing # 0 - 1: company 1086107/lkbagrbc
storing number of vowels: 1
Processing # 0 - 2: company 1298367/nsdjsqio
storing number of vowels: 2
Traceback (most recent call last):
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2714, in execute_sql
cursor.execute(sql, params or ())
psycopg2.OperationalError: SSL error: sslv3 alert bad record mac
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/miniconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/eelco/PycharmProjects/multiproc_peewee/parser.py", line 82, in run
company.save()
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 5748, in save
rows = self.update(**field_dict).where(self._pk_expr()).execute()
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1625, in inner
return method(self, database, *args, **kwargs)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1696, in execute
return self._execute(database)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2121, in _execute
cursor = database.execute(self)
File "/opt/miniconda3/lib/python3.7/site-packages/playhouse/postgres_ext.py", line 468, in execute
cursor = self.execute_sql(sql, params, commit=commit)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2721, in execute_sql
self.commit()
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2512, in __exit__
reraise(new_type, new_type(*exc_args), traceback)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 186, in reraise
raise value.with_traceback(tb)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2714, in execute_sql
cursor.execute(sql, params or ())
peewee.OperationalError: SSL error: sslv3 alert bad record mac
Process CompanyParser-2:
Traceback (most recent call last):
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2714, in execute_sql
cursor.execute(sql, params or ())
psycopg2.OperationalError: SSL error: decryption failed or bad record mac
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/miniconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/eelco/PycharmProjects/multiproc_peewee/parser.py", line 72, in run
print("Found {} companies".format(query.count()))
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1625, in inner
return method(self, database, *args, **kwargs)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1881, in count
return Select([clone], [fn.COUNT(SQL('1'))]).scalar(database)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1625, in inner
return method(self, database, *args, **kwargs)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1866, in scalar
row = self.tuples().peek(database)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1625, in inner
return method(self, database, *args, **kwargs)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1853, in peek
rows = self.execute(database)[:n]
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1625, in inner
return method(self, database, *args, **kwargs)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1696, in execute
return self._execute(database)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 1847, in _execute
cursor = database.execute(self)
File "/opt/miniconda3/lib/python3.7/site-packages/playhouse/postgres_ext.py", line 468, in execute
cursor = self.execute_sql(sql, params, commit=commit)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2721, in execute_sql
self.commit()
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2512, in __exit__
reraise(new_type, new_type(*exc_args), traceback)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 186, in reraise
raise value.with_traceback(tb)
File "/opt/miniconda3/lib/python3.7/site-packages/peewee.py", line 2714, in execute_sql
cursor.execute(sql, params or ())
peewee.OperationalError: SSL error: decryption failed or bad record mac
Process finished with exit code 0
Somehow something goes wrong as soon as the second process starts to do something with the database. Does somebody have advice to get this code working? I have tried the following already:
Trying PooledPostgresqlDatabase and the normal PostgresqlDatabase to connect to the database. This leads to the same error.
Trying SQLite instead of Postgres. This works for 2 cores, but only if the two processes are not interfering too much; otherwise I get some locking problems. I was under the impression that Postgres would be better for multiprocessing than SQLite (is that true?).
Putting a break after launching the first process (so effectively using only one core). In that case the code works, showing that the start method is called correctly.
Hopefully somebody can advise.
Regards
Eelco
After some searching on the internet today I found the solution for my problem here: github.com/coleifer. As coleifer mentions: you apparently first have to set up all the forks before you start connecting to the database. Based on this idea I have modified my code and it is working now.
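The rule, in short: make sure no connection is open at the moment the processes are forked, and let every child open its own connection afterwards. A minimal sketch of that ordering (illustrative names, assuming a peewee database object like the one in my models.py; prepare_tables_and_data is a hypothetical setup helper):
# parent process: do the setup work, then close the connection before forking
database.connect(reuse_if_open=True)
prepare_tables_and_data()            # hypothetical setup helper
database.close()
workers = [Worker() for _ in range(2)]
for w in workers:
    w.start()                        # fork happens here, with no open connection
# inside Worker.run, i.e. in the child process:
#     database.connect(reuse_if_open=True)   # each child gets its own connection
#     ... run the queries ...
#     database.close()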
For those interested, I will post my Python scripts again so you can see how I did it, because there are not many explicit examples out there; perhaps it may help others.
First of all, all the database and peewee setup is now moved into initialization functions which are only called inside the constructor of the CompanyParser class.
So models.py looks like this:
import peewee as pw
from playhouse.pool import PooledPostgresqlExtDatabase, PostgresqlDatabase, PooledPostgresqlDatabase
KVK_KEY = "id_number"
NAME_KEY = "name"
N_VOWELS_KEY = "n_vowels"
def init_database():
db = PooledPostgresqlDatabase(
"testdb", user="postgres", host="localhost", port=5432, password="xxxxx",
max_connections=8, stale_timeout=300)
return db
def init_models(db, reset_tables=False):
class BaseModel(pw.Model):
class Meta:
database = db
# this class describes the format of the sql data base
class Company(BaseModel):
id_number = pw.IntegerField(primary_key=True)
name = pw.CharField(null=True)
n_vowels = pw.IntegerField(default=-1)
processor = pw.IntegerField(default=-1)
if db.is_closed():
db.connect()
if reset_tables and Company.table_exists():
db.drop_tables([Company])
db.create_tables([Company])
return Company
Then, the worker class CompanyParser is defined in the parser.py script and looks like this:
import multiprocessing as mp
import random
import string
import numpy as np
import pandas as pd
import peewee as pw
from models import (KVK_KEY, NAME_KEY, init_database, init_models)
MAX_SQL_CHUNK = 1000
np.random.seed(0)
def random_name(size=32, chars=string.ascii_lowercase):
""" Create a random character string of 'size' characters """
return "".join(random.choice(chars) for _ in range(size))
def vowel_count(characters):
"""
Count the number of vowels in the string 'characters' and return as an integer
"""
count = 0
for char in characters:
if char in list("aeiou"):
count += 1
return count
class CompanyParser(mp.Process):
def __init__(self, reset_tables=False,
number_of_companies=100, i_proc=None,
number_of_procs=1, first_id=None, last_id=None):
if i_proc is not None and number_of_procs > 1:
mp.Process.__init__(self)
self.i_proc = i_proc
self.reset_tables = reset_tables
self.number_of_procs = number_of_procs
self.n_companies = number_of_companies
self.data_df: pd.DataFrame = None
self.first_id = first_id
self.last_id = last_id
# initialise the database and models
self.database = init_database()
self.Company = init_models(self.database, reset_tables=self.reset_tables)
def generate_data(self):
""" Create a dataframe with fake company data and id's and return the array of id's"""
id_list = np.random.randint(1000000, 9999999, self.n_companies)
company_list = np.array([random_name() for _ in range(self.n_companies)])
self.data_df = pd.DataFrame(data=np.vstack([id_list, company_list]).T,
columns=[KVK_KEY, NAME_KEY])
self.data_df.drop_duplicates([KVK_KEY], inplace=True)
self.data_df.sort_values([KVK_KEY], inplace=True)
return self.data_df[KVK_KEY].values
def store_to_database(self):
"""
Store the company data to a sql database
"""
record_list = list(self.data_df.to_dict(orient="index").values())
n_batch = int(len(record_list) / MAX_SQL_CHUNK) + 1
with self.database.atomic():
for cnt, batch in enumerate(pw.chunked(record_list, MAX_SQL_CHUNK)):
print(f"writing {cnt}/{n_batch}")
self.Company.insert_many(batch).execute()
def run(self):
query = (self.Company.
select().
where(self.Company.id_number.between(self.first_id, self.last_id)))
for cnt, company in enumerate(query):
print("Processing # {} - {}: company {}/{}".format(self.i_proc, cnt, company.id_number,
company.name))
number_of_vowels = vowel_count(company.name)
company.n_vowels = number_of_vowels
company.processor = self.i_proc
try:
company.save()
except (pw.OperationalError, pw.InterfaceError) as err:
print("failed save for {} {}: {}".format(self.i_proc, cnt, err))
else:
pass
Finally, the main.py script which launches the processes:
from parser import CompanyParser
import time
def main():
number_of_processors = 2
number_of_companies = 10000
parser = CompanyParser(number_of_companies=number_of_companies, reset_tables=True)
company_ids = parser.generate_data()
parser.store_to_database()
n_companies = company_ids.size
n_comp_per_proc = int(n_companies / number_of_processors)
print("Found {} companies: {} per proc".format(n_companies, n_comp_per_proc))
if not parser.database.is_closed():
parser.database.close()
processes = list()
for i_proc in range(number_of_processors):
i_start = i_proc * n_comp_per_proc
first_id = company_ids[i_start]
last_id = company_ids[i_start + n_comp_per_proc - 1]
print(f"Running proc {i_proc} for id {first_id} until id {last_id}")
sub_parser = CompanyParser(first_id=first_id, last_id=last_id, i_proc=i_proc,
number_of_procs=number_of_processors)
if number_of_processors > 1:
sub_parser.start()
else:
sub_parser.run()
processes.append(sub_parser)
# this blocks the script until all processes are done
for job in processes:
job.join()
# make sure all the connections are closed
for i_proc in range(number_of_processors):
db = processes[i_proc].database
if not db.is_closed():
db.close()
print("Goodbye!")
if __name__ == "__main__":
start = time.time()
main()
duration = time.time() - start
print(f"Done in {duration} s")
As you can see, the database connection is done per process inside the class.
This example works and is a full example of multiprocessing + peewee and PostgreSQL. Hopefully it may help others. In case you have any comments or suggestions for improvement, please let me know.
I got this error too, but with flask + peewee + rq on Heroku. Below is how I solved it:
If you have a simple app that you use with RQ, I would suggest using SimpleWorker.
RQ suggests using rq.worker.HerokuWorker, but I still received an SSL error with it.
The error appeared in a case where I had created follow-up (chained) tasks, where the execution of one depends on another task's success.
Also, I am using flask-rq2, but this applies to normal usage as well, as shown below:
# app.py
app = Flask(__name__)
app.config['RQ_WORKER_CLASS'] = os.getenv('RQ_WORKER_CLASS', 'rq.worker.Worker')
rq = RQ(app)
I solved it by changing the following in the Heroku config:
set your RQ_WORKER_CLASS to rq.worker.SimpleWorker.
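For reference, assuming the worker class is read from an environment variable as in the app.py snippet above, that is a single Heroku CLI call (the app name is a placeholder):
heroku config:set RQ_WORKER_CLASS=rq.worker.SimpleWorker --app your-app-name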
The functionality of this script is to shut down GCP VMs based on some logic. Currently we are trying to shut them down at night, but the script is failing before it can shut down the VMs. We shut down based on time: in the evenings and at night the VMs are shut down using these scripts.
Script to turn off instances overnight
Environment: Runs in Google Cloud Functions (Python3.7)
import datetime
import json
from pprint import pformat
import pytz
import re
import modules.common.cfcommon as cfcommon
import modules.utilities.dateutilities as dateutilities
from modules.compute.instances import InstanceList, Instance
from modules.compute.compute_service import ComputeServiceContext
from modules.utilities.printutilities import print_message, debug_message
from modules.pubsub.topic import PublishMessage
from modules.common.labels import VMAUTOSHUTDOWN_LABEL, VMAUTOSHUTDOWN_DEFER_LABEL, ShutdownDeferLabelValueIsValid, ShutdownLabelValueIsValid
from templates.renderer import render_template
# Takes a list in the following format and checks if the 'Instance' object is within it
# list must contain dictionaries in the following format:
# {"name": "instancename", "zone": "zonename"}
# Example: {"name": "test-01", "zone": "us-east4-c"}
#
# Parameters:
# inputList - list of dictionary objects
# instance - Instance object
def isInstanceInList(inputList, instance):
if not isinstance(inputList, list):
raise TypeError("Provided inputList is not a list")
if not isinstance(instance, Instance):
raise TypeError("Provided instance is not of type 'Instance'")
# Iterate over every item in inputList and check if the name and zone match
for cItem in inputList:
if cItem["name"].lower() == instance.properties["name"].lower() and cItem["zone"].lower() == instance.GetShortZoneName().lower():
return True
# No match found
return False
# Takes a list of Instance objects and sees if their shutdown time is within the grace period of the shutdown hour
#
# Example: if the shutdown hour is 23 and gracePeriodMin is 15, then if the function is called at 23:12, the instance will be included in the shutdown list
#
# Parameters:
# instanceList - List of Instance objects
# gracePeriodMin - number (the shutdown hour itself is read from each instance's VMAUTOSHUTDOWN_LABEL)
def getInstancesToStop(instanceList, gracePeriodMin):
instancesToStop = []
debug_message("Entering getInstancesToStop")
for cInstance in instanceList:
debug_message("Instance: %s (ID: %s, Zone: %s, Project: %s)" % (cInstance.GetName(), cInstance.GetId(), cInstance.GetShortZoneName(), cInstance.project))
labels = cInstance.GetLabels()
if VMAUTOSHUTDOWN_LABEL in labels.keys():
labelValue = labels.get(VMAUTOSHUTDOWN_LABEL, '')
pattern = r'\d\d-\d\d-\d\d'
match = re.match(pattern, labelValue)
if not match or not ShutdownLabelValueIsValid(labelValue):
debug_message(f'Label {labelValue} does not match the correct format')
instancesToStop.append(cInstance)
continue
else:
debug_message(f'Label {VMAUTOSHUTDOWN_LABEL} not found. Adding to shutdown list')
instancesToStop.append(cInstance)
continue
shutdown_deferred_utc_datetime = None
if VMAUTOSHUTDOWN_DEFER_LABEL in labels.keys():
labelValue = labels.get(VMAUTOSHUTDOWN_DEFER_LABEL, '')
pattern = r'\d\d\d\d-\d\d-\d\dt\d\d-\d\d-\d\d'
match = re.match(pattern, labelValue)
if match and ShutdownDeferLabelValueIsValid(labelValue):
shutdown_deferred_utc_date, shutdown_deferred_utc_time = labelValue.split('t')
year, month, day = shutdown_deferred_utc_date.split('-')
hour, minute, second = shutdown_deferred_utc_time.split('-')
shutdown_deferred_utc_datetime = datetime.datetime.now(pytz.timezone('GMT')).replace(
year=int(year), month=int(month), day=int(day), hour=int(hour), minute=int(minute), second=int(second)
)
else:
debug_message(f'Label {labels[VMAUTOSHUTDOWN_DEFER_LABEL]} does not match the correct format')
instancesToStop.append(cInstance)
continue
current_utc_time = dateutilities.get_current_datetime()
# If defer date is in the future, and not in grace window time, skip shutting down
if shutdown_deferred_utc_datetime is not None and shutdown_deferred_utc_datetime > current_utc_time:
debug_message(f'Instance {cInstance.GetName()} shutdown deferred until after {labels[VMAUTOSHUTDOWN_DEFER_LABEL]}')
continue
# If defer time is in past, continue with the vm hour shutdown
shutdown_utc_hour = labels[VMAUTOSHUTDOWN_LABEL].split('-')[0]
# Convert shutdown UTC hour into datetime object
shutdown_utc_time = datetime.datetime.now(pytz.timezone('GMT')).replace(hour=int(shutdown_utc_hour), minute=0, second=0)
shutdown_utc_grace_time = shutdown_utc_time + datetime.timedelta(minutes=gracePeriodMin)
debug_message(f"Shutdown UTC time {shutdown_utc_time}")
debug_message(f"Shutdown UTC grace time {shutdown_utc_grace_time}")
# Check if shutdown is within time window
if current_utc_time >= shutdown_utc_time and current_utc_time <= shutdown_utc_grace_time:
debug_message("We're in the time window")
instancesToStop.append(cInstance)
else:
debug_message("We're outside the time window. Not adding to stop list")
return instancesToStop
# This is the main entry point that cloud functions calls
def AutoStopVMInstances(config, policy=None, payload=None, generate_local_report=False):
FUNCTION_NAME = "AutoStopVMInstances"
# Populated by config later...
QUERY_PROJECT_IDS = None # List of project IDs
INSTANCE_WHITELIST = None # List of dictionaries in format {"name": "instancename", "zone": "zonename", "project": "projectid"}
PREVIEW_MODE = True
SHUTDOWN_GRACEPERIOD_MIN = 30
# Start
startTime = datetime.datetime.now()
print_message("Started %s within Cloud Function %s [%s]" % (FUNCTION_NAME, cfcommon.CLOUD_FUNCTION_NAME, startTime))
debug_message("")
# For ease of access, assign from config values
debug_message("Processing Configuration...")
QUERY_PROJECT_IDS = config.get("QueryProjectIDs", []) # Required field
INSTANCE_WHITELIST = config.get("InstanceWhiteList", []) # Optional Field
PREVIEW_MODE = config.get("PreviewMode", True) # Required field
SHUTDOWN_GRACEPERIOD_MIN = config.get("ShutdownGracePeriodMin", None) # Required field
SKIP_INSTANCE_GROUPS = config.get("SkipInstanceGroups", False) # Optional
EMAIL_PUB_SUB_PROJECT = config.get("EmailPubSubProject", None) # Optional
EMAIL_PUB_SUB_TOPIC = config.get("EmailPubSubTopic", None) # Optional
EMAIL_TO = config.get("EmailTo", []) # Optional
EMAIL_CC = config.get("EmailCC", None) # Optional
EMAIL_BCC = config.get("EmailBCC", None) # Optional
EMAIL_FROM = config.get("EmailFrom", "noreply-ei-cs-cloudops-resource-administration@ei-cs-cloudops.local") # Optional
EMAIL_SUBJECT = config.get("EmailSubject", "Nightly VM Instance Shutdown Summary") # Optional
cfLogger = cfcommon.CloudFunctionLog()
# Validate whitelist
if INSTANCE_WHITELIST is None:
raise Exception("Unable to get whitelist")
debug_message("Whitelist loaded:")
debug_message(pformat((INSTANCE_WHITELIST)))
# Re-init Compute service - execution environment in cloud functions can be shared among each other. Let's re-init our connection every execution.
ComputeServiceContext.InitComputeService()
# Build the service object.
allRunningInstances = []
for cProjectId in QUERY_PROJECT_IDS:
debug_message("Checking Project: %s" % (cProjectId))
# Main Loop - Let's get and analyze all instances from our project
# Paginated within the 'request' object
runningInstances = []
allInstances = []
debug_message("Building Instance List...", end="")
instances = InstanceList(cProjectId)
instances.PopulateInstances()
debug_message("Done")
for cInstance in instances.GetAllInstances():
debug_message("Found Instance %s in %s [%s - %s]" % (cInstance.GetName(), cInstance.GetZone(), cInstance.GetId(), cInstance.GetStatus()))
# Check if whitelisted. If it is, skip it
if isInstanceInList(INSTANCE_WHITELIST, cInstance):
debug_message(" Instance is whitelisted. Skipping.")
continue
# Check if we should skip instance groups
if SKIP_INSTANCE_GROUPS and cInstance.IsWithinInstanceGroup():
debug_message(" Instance is within an instance group. Skipping.")
continue
debug_message(" Is Running: %s" % (cInstance.IsRunning()))
owner = cInstance.GetOwner()
if owner in ("devops", "ei devops", "eicsdevopseng"):
debug_message("Skipping instance owned by devops")
continue
# # TODO: FOR USE WHEN TESTING
# if VMAUTOSHUTDOWN_LABEL not in labels.keys():
# continue
# Keep track of this instance
allInstances.append(cInstance)
# If it's running, it's a candidate to stop
if cInstance.IsRunning():
runningInstances.append(cInstance)
# Handle no instances found
if len(allInstances) == 0:
debug_message("INFO: No Instances found.")
# Summarize for user
debug_message("")
if len(runningInstances) > 0:
debug_message("Found %s/%s non-whitelisted instances are running (project: %s)" % (len(runningInstances), len(allInstances), cProjectId))
else:
debug_message("All %s non-whitelisted instances are good (project: %s)" % (len(allInstances), cProjectId))
# Main loop to stop
debug_message("")
allRunningInstances = allRunningInstances + runningInstances
instancesToBeStopped = getInstancesToStop(allRunningInstances, SHUTDOWN_GRACEPERIOD_MIN)
stoppedCount = 0
instanceSummary = []
if len(instancesToBeStopped) == 0:
print_message("No instances are due to be stopped")
else:
for cInstance in instancesToBeStopped:
summaryEntry = {
"Name": cInstance.GetName(),
"ID": cInstance.GetId(),
"Zone": cInstance.GetShortZoneName(),
"Project": cInstance.GetProject(),
"Preview": PREVIEW_MODE,
"Stopped": False,
"InstanceLink": cInstance.GetSelfLinkToConsole()
}
logMessage = "Stopping Instance: {name} (ID: {id}, Zone: {zone}, Project: {project})".format(
name=summaryEntry.get("Name"),
id=summaryEntry.get("ID"),
zone=summaryEntry.get("Zone"),
project=summaryEntry.get("Project")
)
if PREVIEW_MODE:
print_message("(PREVIEW) " + logMessage )
else:
print_message(logMessage)
cInstance.Stop()
summaryEntry["Stopped"] = True
stoppedCount += 1
instanceSummary.append(summaryEntry)
if EMAIL_PUB_SUB_PROJECT is not None and EMAIL_PUB_SUB_TOPIC is not None:
debug_message("It looks like we have an email config. Attempting to send email")
emailBody = render_template(
'shutdown_report',
instance_summary=instanceSummary,
config=json.dumps(config, indent=4, sort_keys=True),
preview_mode=PREVIEW_MODE,
generation_time=datetime.datetime.now().astimezone(pytz.utc)
)
emailPayload = {
"To": EMAIL_TO,
"From": EMAIL_FROM,
"Subject": EMAIL_SUBJECT,
"BodyHtml": emailBody
}
if EMAIL_CC is not None:
emailPayload["CC"] = EMAIL_CC
if EMAIL_BCC is not None:
emailPayload["BCC"] = EMAIL_BCC
if not generate_local_report:
print_message("Sending email...", end="")
PublishMessage(EMAIL_PUB_SUB_PROJECT, EMAIL_PUB_SUB_TOPIC, json.dumps(emailPayload))
else:
print_message('Generating local HTML report')
with open('./html_reports/shutdown_report.html', 'w') as r:
r.write(emailBody)
print_message("Done")
# We want to log a nice structured json line to stackdriver for easy reporting.
cfLogger.log({
"StartTime": startTime.isoformat(),
"InstancesStoppedCount": stoppedCount,
"PreviewMode": PREVIEW_MODE,
"Instances": instanceSummary,
"Whitelist": INSTANCE_WHITELIST,
"EndTime": datetime.datetime.now().isoformat(),
"LogLine": "Summary"
})
print_message("DONE [%s]" % (datetime.datetime.now()))
The error I'm getting when trying to run a VM shutdown script:
Caught exception while running VMNightlyShutdown. Exception Text: Traceback (most recent call last):
File "/workspace/main.py", line 248, in StartCloudFunction
policy=policy_config)
File "/workspace/vm_nightly_shutdown.py", line 256, in AutoStopVMInstances
cInstance.Stop()
File "/workspace/modules/compute/instances.py", line 729, in Stop
stop_instance(self.project, self.GetShortZoneName(), self.GetName(), waitForCompletion=waitForCompletion)
File "/workspace/modules/compute/instances.py", line 45, in stop_instance
return wait_for_operation(project, zone, response["name"])
File "/workspace/modules/compute/instances.py", line 27, in wait_for_operation
operation=operation
File "/layers/google.python.pip/pip/lib/python3.7/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
return wrapped(*args, **kwargs)
File "/layers/google.python.pip/pip/lib/python3.7/site-packages/googleapiclient/http.py", line 932, in execute
headers=self.headers,
File "/layers/google.python.pip/pip/lib/python3.7/site-packages/googleapiclient/http.py", line 222, in _retry_request
raise exception
File "/layers/google.python.pip/pip/lib/python3.7/site-packages/googleapiclient/http.py", line 191, in _retry_request
resp, content = http.request(uri, method, *args, **kwargs)
File "/layers/google.python.pip/pip/lib/python3.7/site-packages/google_auth_httplib2.py", line 225, in request
**kwargs
File "/layers/google.python.pip/pip/lib/python3.7/site-packages/httplib2/__init__.py", line 1721, in request
conn, authority, uri, request_uri, method, body, headers, redirections, cachekey,
File "/layers/google.python.pip/pip/lib/python3.7/site-packages/httplib2/__init__.py", line 1440, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/layers/google.python.pip/pip/lib/python3.7/site-packages/httplib2/__init__.py", line 1392, in _conn_request
response = conn.getresponse()
File "/layers/google.python.runtime/python/lib/python3.7/http/client.py", line 1373, in getresponse
response.begin()
File "/layers/google.python.runtime/python/lib/python3.7/http/client.py", line 319, in begin
version, status, reason = self._read_status()
File "/layers/google.python.runtime/python/lib/python3.7/http/client.py", line 280, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/layers/google.python.runtime/python/lib/python3.7/socket.py", line 589, in readinto
return self._sock.recv_into(b)
File "/layers/google.python.runtime/python/lib/python3.7/ssl.py", line 1071, in recv_into
return self.read(nbytes, buffer)
File "/layers/google.python.runtime/python/lib/python3.7/ssl.py", line 929, in read
return self._sslobj.read(len, buffer)
ssl.SSLError: [SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:2570)",
}
I am trying to crawl abstracts from PubMed and filter them using regex in Python. To speed things up, I wanted to use Python's multiprocessing pool.
My code looks like the following:
import multiprocessing as mp
from functools import partial
from typing import List, Tuple
def beautify_abstract(abstract: str, regex: str):
import re
result: str = ""
last_start = 0
matches = re.finditer(regex, abstract, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
result += abstract[last_start:match.start()]
result += "<b>"
result += abstract[match.start():match.end()]
result += "</b>"
last_start = match.end()
result += abstract[last_start:]
return result
def get_doi(pim: str, regex: str):
from Bio import Entrez
from Bio.Entrez import efetch
import re
from metapub.convert import pmid2doi
Entrez.email = "Your.Name.Here@example.org"
print(f"Processing {pim}")
abstract_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='all')
abstract = abstract_handle.read()
abstract_handle.close()
if re.search(regex, abstract, re.MULTILINE) is not None:
docsum_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='docsum')
docsum = docsum_handle.read()
try:
doi = pmid2doi(pim)
except:
doi = "UNKNOWN"
return f"{doi}"
return ""
def get_pim_with_regex_list(keywords: List[str]) -> List[str]:
from Bio import Entrez
Entrez.email = "Your.Name.Here@example.org"
searchterm = " ".join(keywords)
pims = []
handle = Entrez.esearch(db="pubmed", retstart=0, retmax=0, term=searchterm, idtype="acc")
record = Entrez.read(handle)
handle.close()
count = int(record['Count'])
if count > 100000:
retmax = 100000
else:
retmax = count
retstart = 0
while retstart < count:
handle = Entrez.esearch(db="pubmed", retstart=retstart, retmax=retmax, term=searchterm, idtype="acc")
record = Entrez.read(handle)
handle.close()
for pim in record['IdList']:
pims.append(pim)
retstart += retmax
return pims
if __name__ == '__main__':
keywords = ["keyword1", "keyword2"]
pim_list = get_pim_with_regex_list(keywords)
regex = "keyword1 keyword2"
worker_fn = partial(get_doi, regex=regex)
pool = mp.Pool(mp.cpu_count())
entries = pool.map(worker_fn, pim_list)
pool.close()
pool.join()
When I run the given code, I get the following error:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
Process ForkPoolWorker-4:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
I did some digging into multiprocessing with Python and found out that the parameters have to be picklable (this is enforced by the ForkingPickler).
Assuming that str is picklable, the code should work... Currently, I am completely lost and have no idea what the problem may be.
As suggested, I uploaded a minimal (sequential) working example here
Is there any way to fix this problem or at least diagnose the real issue here?
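One way to at least narrow down which object trips the ForkingPickler (a diagnostic sketch using only the standard library; pim_list comes from the script above) is to round-trip each task argument through pickle, since that is what multiprocessing does under the hood:
import pickle

for pim in pim_list:
    try:
        pickle.loads(pickle.dumps(pim))
    except Exception as err:
        print(f"{pim!r} of type {type(pim)} does not survive pickling: {err}")
If the IDs returned by Entrez.read turn out to be str subclasses rather than plain strings, casting them with str(pim) before calling pool.map may be worth trying.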
I'm working on automating some query extraction using Python and pyodbc, then converting to Parquet format and sending to AWS S3.
My script solution is working fine so far, but I have faced a problem. I have a schema, let us call it SCHEMA_A, and inside of it several tables, TABLE_1, TABLE_2, ..., TABLE_N.
All those tables inside that schema are accessible by using the same credentials.
So I'm using a script like this one to automate the task.
def get_stream(cursor, batch_size=100000):
while True:
row = cursor.fetchmany(batch_size)
if row is None or not row:
break
yield row
cnxn = pyodbc.connect(driver='pyodbc driver here',
host='host name',
database='schema name',
user='user name',
password='password')
print('Connection stabilished ...')
cursor = cnxn.cursor()
print('Initializing cursos ...')
if len(sys.argv) > 1:
table_name = sys.argv[1]
cursor.execute('SELECT * FROM {}'.format(table_name))
else:
exit()
print('Query fetched ...')
row_batch = get_stream(cursor)
print('Getting Iterator ...')
cols = cursor.description
cols = [col[0] for col in cols]
print('Initalizin batch data frame ..')
df = pd.DataFrame(columns=cols)
start_time = time.time()
for rows in row_batch:
tmp = pd.DataFrame.from_records(rows, columns=cols)
df = df.append(tmp, ignore_index=True)
tmp = None
print("--- Batch inserted inn%s seconds ---" % (time.time() - start_time))
start_time = time.time()
I run code similar to that inside Airflow tasks, and it works just fine for all other tables. But then I have two tables, let's call them TABLE_I and TABLE_II, that yield the following error when I execute cursor.fetchmany(batch_size):
ERROR - ('ODBC SQL type -151 is not yet supported. column-index=16 type=-151', 'HY106')
Traceback (most recent call last):
File "/home/ubuntu/.local/lib/python3.8/site-packages/airflow/models/taskinstance.py", line 1112, in _run_raw_task
self._prepare_and_execute_task_with_callbacks(context, task)
File "/home/ubuntu/.local/lib/python3.8/site-packages/airflow/models/taskinstance.py", line 1285, in _prepare_and_execute_task_with_callbacks
result = self._execute_task(context, task_copy)
File "/home/ubuntu/.local/lib/python3.8/site-packages/airflow/models/taskinstance.py", line 1310, in _execute_task
result = task_copy.execute(context=context)
File "/home/ubuntu/.local/lib/python3.8/site-packages/airflow/operators/python.py", line 117, in execute
return_value = self.execute_callable()
File "/home/ubuntu/.local/lib/python3.8/site-packages/airflow/operators/python.py", line 128, in execute_callable
return self.python_callable(*self.op_args, **self.op_kwargs)
File "/home/ubuntu/prea-ninja-airflow/jobs/plugins/extract/fetch.py", line 58, in fetch_data
for rows in row_batch:
File "/home/ubuntu/prea-ninja-airflow/jobs/plugins/extract/fetch.py", line 27, in stream
row = cursor.fetchmany(batch_size)
Inspecting those tables with SQLElectron and querying the first few lines, I realized that both TABLE_I and TABLE_II have a column called 'Geolocalizacao'. When I use SQL Server to find the data type of that column with:
SELECT DATA_TYPE
FROM INFORMATION_SCHEMA.COLUMNS
WHERE
TABLE_NAME = 'TABLE_I' AND
COLUMN_NAME = 'Geolocalizacao';
It yields:
DATA_TYPE
geography
Searching here on Stack Overflow I found this solution: python pyodbc SQL Server Native Client 11.0 cannot return geometry column
By the user's description, it seems to work fine by adding:
def unpack_geometry(raw_bytes):
# adapted from SSCLRT information at
# https://learn.microsoft.com/en-us/openspecs/sql_server_protocols/ms-ssclrt/dc988cb6-4812-4ec6-91cd-cce329f6ecda
tup = struct.unpack('<i2b3d', raw_bytes)
# tup contains: (unknown, Version, Serialization_Properties, X, Y, SRID)
return tup[3], tup[4], tup[5]
and then:
cnxn.add_output_converter(-151, unpack_geometry)
after creating the connection. But it's not working for the GEOGRAPHY data type; when I use this code (adding import struct to the Python script), it gives me the following error:
Traceback (most recent call last):
File "benchmark.py", line 79, in <module>
for rows in row_batch:
File "benchmark.py", line 39, in get_stream
row = cursor.fetchmany(batch_size)
File "benchmark.py", line 47, in unpack_geometry
tup = struct.unpack('<i2b3d', raw_bytes)
struct.error: unpack requires a buffer of 30 bytes
An example of the values that this column holds follows this template:
{"srid":4326,"version":1,"points":[{}],"figures":[{"attribute":1,"pointOffset":0}],"shapes":[{"parentOffset":-1,"figureOffset":0,"type":1}],"segments":[]}
I honestly don't know how to adapt the code to this structure; can someone help me? It's been working fine for all other tables, but I have these two tables with this column that are giving me a lot of headache.
Hi, this is what I have done:
from binascii import hexlify
def _handle_geometry(geometry_value):
return f"0x{hexlify(geometry_value).decode().upper()}"
and then on connection:
cnxn.add_output_converter(-151, _handle_geometry)
This will return the value the same way SSMS does.
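For completeness, a sketch of how this hooks into the connection (the connection string is a placeholder; -151 is the ODBC type code from the error above):
import pyodbc
from binascii import hexlify

def _handle_geometry(geometry_value):
    # render the raw UDT bytes the way SSMS displays them
    return f"0x{hexlify(geometry_value).decode().upper()}"

cnxn = pyodbc.connect('DSN=mydsn')                  # placeholder connection details
cnxn.add_output_converter(-151, _handle_geometry)   # -151 is the unsupported UDT type
cursor = cnxn.cursor()
for row in cursor.execute("SELECT Geolocalizacao FROM TABLE_I"):
    print(row[0])                                   # e.g. 0xE6100000...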
I have a couple of tables that I want to run my Python functions through; however, I am having a hard time writing my script to do that.
Here is my variable with the tables:
object_to_export = ['Opportunity','Asset']
Here is my code:
object_to_export = ['Opportunity','Asset']
def sf_login():
"""
Purpose: This function is designed to validate your Salesforce credentials and create/return an instance or session.
Note: If we have already signed in, this will just return the original object.
Note: The credentials are being held inside of the config.py file.
"""
session_id, instance = SalesforceLogin(username=config.username,
password=config.password,
security_token=config.security_token)
sf = Salesforce(instance=instance,
session_id=session_id)
return sf
def sf_object_extract():
"""
Purpose: This function is designed to extract data from Salesforce objects and return the results.
Note: For sample sake, I have set the limit to 1000 records per object.
"""
dd = sf_login()
set_limit = 1000
for sf_obj in object_to_export:
desc = dd.sf_obj.describe()
field_names = [field['name'] for field in desc['fields']]
soql = "SELECT {} FROM {} limit {}".format(','.join(field_names), sf_obj, set_limit)
results = dd.query_all(soql)
return results
sf_object_extract()
When I run my code, I receive this error:
Traceback (most recent call last):
File "/Users/m/PycharmProjects/test/sf_etl_hw_aaa_v1.py", line 43, in <module>
sf_object_extract()
File "/Users/m/PycharmProjects/test/sf_etl_hw_aaa_v1.py", line 37, in sf_object_extract
desc = dd.sf_obj.describe()
File "/Users/m/venv/lib/python3.7/site-packages/simple_salesforce/api.py", line 565, in describe
headers=headers
File "/Users/m/venv/lib/python3.7/site-packages/simple_salesforce/api.py", line 771, in _call_salesforce
exception_handler(result, self.name)
File "/Users/m/venv/lib/python3.7/site-packages/simple_salesforce/util.py", line 68, in exception_handler
raise exc_cls(result.url, result.status_code, name, response_content)
simple_salesforce.exceptions.SalesforceResourceNotFound: Resource sf_obj Not Found. Response content: [{'errorCode': 'NOT_FOUND', 'message': 'The requested resource does not exist'}]
Any idea why I am receiving this error, and how to fix my script so that the table parameters are passed to my desc and soql variables?
Addition: I also tried setting the desc variable as
desc = int("dd.{}.describe()".format(sf_obj))
But it gives me this error:
desc = int("dd.{}.describe()".format(sf_obj))
ValueError: invalid literal for int() with base 10: 'dd.Opportunity.describe()'
object_to_export = ['Opportunity','Asset']
def sf_login():
"""
Purpose: This function is designed to validate your Salesforce credentials and
create/return an instance or session.
Note: If we have already signed in, this will just return the original object.
Note: The credentials are being held inside of the config.py file.
"""
session_id, instance = SalesforceLogin(username=config.username,
password=config.password,
security_token=config.security_token)
sf = Salesforce(instance=instance,
session_id=session_id)
return sf
def sf_object_extract():
"""
Purpose: This function is designed to extract data from Salesforce objects and
return the results.
Note: For sample sake, I have set the limit to 1000 records per object.
"""
dd = sf_login()
set_limit = 1000
for sf_obj in object_to_export:
desc = getattr(dd, sf_obj).describe() #dd.sf_obj.describe()
field_names = [field['name'] for field in desc['fields']]
soql = "SELECT {} FROM {} limit {}".format(','.join(field_names), sf_obj,
set_limit)
results = dd.query_all(soql)
return results
sf_object_extract()
You need to use getattr(obj, name_of_member); see the line with the comment. dd.sf_obj looks up an attribute literally named sf_obj, which does not exist, whereas getattr(dd, sf_obj) looks up the attribute whose name is the value of the variable.
An example:
class d():
def __init__(self):
self.x = 0
q = d()
#to get the value of 'x' member of q you can do 2 things
x_value = getattr(q, 'x')
x_value = q.x
I'm trying to get my emails to work from the command line with mutt. I have been trying to follow these two guides: http://blog.developwithpassion.com/2013/05/02/getting-up-and-running-with-a-sane-mutt-setup/ and http://stevelosh.com/blog/2012/10/the-homely-mutt/#configuring-offlineimap
To do this there are 4 main steps:
1. setup offlineimap to download and keep synced your emails
2. setup mutt (the email user interface)
3. setup notmuch (to be able to search your emails)
4. setup msmtp (to be able to send emails)
Note that I am using a MacBook Pro running OS X 10.9.2. I am stuck at step 1 because I am getting an error with offlineimap! I am able to run offlineimap for a long time, i.e. it will sync all of the emails (36197 of them!), and then right at the end it spits out the following error:
ERROR: getfolder() asked for a nonexisting folder 'Drafts'.
Folder Deleted Items [acc: gloriphobia]:
Establishing connection to imap.gmail.com:993
Account sync gloriphobia:
*** Finished account 'gloriphobia' in 137:16
ERROR: Exceptions occurred during the run!
ERROR: command: UID => socket error: <class 'socket.error'> - [Errno 54] Connection reset by peer
Traceback:
File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/folder/IMAP.py", line 219, in getmessage
'(BODY.PEEK[])')
File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/imaplib2.py", line 1167, in uid
return self._simple_command('UID', command, *args, **kw)
File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/imaplib2.py", line 1615, in _simple_command
return self._command_complete(self._command(name, *args), kw)
File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/imaplib2.py", line 1378, in _command_complete
typ, dat = rqb.get_response('command: %s => %%s' % rqb.name)
File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/imaplib2.py", line 176, in get_response
raise typ(exc_fmt % str(val))
ERROR: getfolder() asked for a nonexisting folder 'Drafts'.
Traceback:
File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/accounts.py", line 241, in syncrunner
self.sync()
File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/accounts.py", line 320, in sync
localfolder = self.get_local_folder(remotefolder)
File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/accounts.py", line 269, in get_local_folder
replace(self.remoterepos.getsep(), self.localrepos.getsep()))
File "/usr/local/Cellar/offline-imap/6.5.5/libexec/offlineimap/repository/Maildir.py", line 134, in getfolder
OfflineImapError.ERROR.FOLDER)
My .offlineimaprc is:
[general]
accounts = gloriphobia
ui = TTYUI
pythonfile=~/Development/MuttMailPython/offline.py
fsync = False
[Account gloriphobia]
localrepository = gloriphobia_local
remoterepository = gloriphobia_remote
status_backend = sqlite
postsynchook = notmuch new
[Repository gloriphobia_local]
type = Maildir
localfolders = ~/.mail/Test
nametrans = get_remote_name
[Repository gloriphobia_remote]
maxconnections = 1
type = Gmail
cert_fingerprint = 89091347184d41768bfc0da9fad94bfe882dd358
remoteuser = myemailaddress
remotepasseval = get_keychain_pass(account="myemailaddress",server="imap.gmail.com")
realdelete = no
nametrans = get_local_name
folderfilter = is_included
My Python file, the one called offline.py, is:
#!/usr/bin/python
import subprocess
import re
class NameMapping:
def __init__(self, local_name, remote_name):
self.local_name = local_name
self.remote_name = remote_name
class LocalName:
def __init__(self, folder):
self.folder = folder
def matches(self, mapping):
return mapping.remote_name == self.folder
def mapped_folder_name(self, mapping):
return mapping.local_name
class RemoteName:
def __init__(self, folder):
self.folder = folder
def matches(self, mapping):
return mapping.local_name == self.folder
def mapped_folder_name(self, mapping):
return mapping.remote_name
def get_keychain_pass(account=None, server=None):
params = {
'security': '/usr/bin/security',
'command': 'find-internet-password',
'account': account,
'server': server,
'keychain': '/Users/mec07/Library/Keychains/login.keychain',
}
command = "sudo -u mec07 %(security)s -v %(command)s -g -a %(account)s -s %(server)s %(keychain)s" % params
output = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
outtext = [l for l in output.splitlines()
if l.startswith('password: ')][0]
return re.match(r'password: "(.*)"', outtext).group(1)
def is_included(folder):
result = True
for pattern in exclusion_patterns:
result = result and (re.search(pattern, folder) == None)
return result
exclusion_patterns = [
"efax",
"earth_class_mail",
"eventbrite",
"gotomeeting",
"moshi_monsters",
"peepcode",
"raini_fowl",
"stuart_know",
"training.*2008",
"training.*2009",
"training.*2010",
"training.*2011",
"training.*2012",
"training.*nbdn",
"training.*nothin_but_bdd",
"unblock_us",
"web_hosting",
"webinars",
"Gmail.*Important"
]
name_mappings = [
NameMapping('inbox', '[Gmail]/Inbox'),
NameMapping('starred', '[Gmail]/Starred'),
NameMapping('important', '[Gmail]/Important'),
NameMapping('sent', '[Gmail]/Sent Mail'),
NameMapping('drafts', '[Gmail]/Drafts'),
NameMapping('archive', '[Gmail]/All Mail'),
NameMapping('spam', '[Gmail]/Spam'),
NameMapping('flagged', '[Gmail]/Starred'),
NameMapping('trash', '[Gmail]/Trash'),
NameMapping('deleted', '[Gmail]/Deleted Items'),
NameMapping('Mum', '[Gmail]/Jana'),
NameMapping('Maggie', '[Gmail]/Maggie'),
NameMapping('papers', '[Gmail]/Scholar Alert'),
NameMapping('sent items', '[Gmail]/Sent Items'),
NameMapping('sent messages', '[Gmail]/Sent Messages')
]
def find_name_mapping(name):
default_mapping = NameMapping(name.folder, name.folder)
for mapping in name_mappings:
if (name.matches(mapping)):
return mapping
return default_mapping
def get_name_mapping(name):
mapping = find_name_mapping(name)
return name.mapped_folder_name(mapping)
def get_remote_name(local_folder_name):
name = RemoteName(local_folder_name)
return get_name_mapping(name)
def get_local_name(remote_folder_name):
name = LocalName(remote_folder_name)
return get_name_mapping(name)
Thanks in advance for your help!
Add:
folderfilter = lambda folder: folder not in ['Drafts']
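For context, that line would replace the existing folderfilter = is_included entry in the [Repository gloriphobia_remote] section of the .offlineimaprc shown above, e.g.:
[Repository gloriphobia_remote]
type = Gmail
folderfilter = lambda folder: folder not in ['Drafts']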