How would I get psycopg2 logging of query execution time?

How would I get psycopg2 logging of query execution time? - python

I'm trying to get the performance statistics on queries executed by psycopg2, but the documentation / examples still seem fuzzy and not as clear as it could be.
I've at least got debugging working through the logger.
What would I need to do to access the performance data for the query? I'm wanting to get the number for query execution time.
Is there a method I can access, or something else I need to initialize to output the query execution time?
Here's a pieced together extract of what I have so far:
import psycopg2
import psycopg2.extensions
from psycopg2.extras import LoggingConnection
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
# set higher up in script
db_settings = {
"user": user,
"password": password,
"host": host,
"database": dbname,
}
query_txt = "[query_txt_from file]"
conn = psycopg2.connect(connection_factory=LoggingConnection, **db_settings)
conn.initialize(logger)
cur = conn.cursor()
cur.execute(query_txt)
and I get
DEBUG:__main__: [the query executed]

Easy enough to set timestamp at start of execution and calculate duration at end. You'll need your own simple subclasses of LoggingConnection and LoggingCursor. See my example code.
This is based on source of MinTimeLoggingConnection you can find in psycopg2/extras.py source.
import time
import psycopg2
import psycopg2.extensions
from psycopg2.extras import LoggingConnection, LoggingCursor
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
# MyLoggingCursor simply sets self.timestamp at start of each query
class MyLoggingCursor(LoggingCursor):
def execute(self, query, vars=None):
self.timestamp = time.time()
return super(MyLoggingCursor, self).execute(query, vars)
def callproc(self, procname, vars=None):
self.timestamp = time.time()
return super(MyLoggingCursor, self).callproc(procname, vars)
# MyLogging Connection:
# a) calls MyLoggingCursor rather than the default
# b) adds resulting execution (+ transport) time via filter()
class MyLoggingConnection(LoggingConnection):
def filter(self, msg, curs):
return msg + " %d ms" % int((time.time() - curs.timestamp) * 1000)
def cursor(self, *args, **kwargs):
kwargs.setdefault('cursor_factory', MyLoggingCursor)
return LoggingConnection.cursor(self, *args, **kwargs)
db_settings = {
....
}
query_txt = "[query_text_from file]"
conn = psycopg2.connect(connection_factory=MyLoggingConnection, **db_settings)
conn.initialize(logger)
cur = conn.cursor()
cur.execute(query_text)
and you'll get:
DEBUG: __main__:[query] 3 ms
within your filter() you can change the formatting, or choose to not display, if less than some value.

Related

Mocking cursor.fetchone() returns None istead of returning a value in python

I wrote a function for my project which fetches data from MSSQL server using Pyodbc. The function works fine. When I write unittest cases using unittest and mock library and mocked the cursor.fetchone and returned a predefined value but while running the test case it returns None instead of returning the value.
Here are my code.
Store.py
import os
from datetime import date
from datetime import timedelta
import logging
logging.basicConfig(filename="source_monitor.log", format='%(name)s - %(levelname)s - %(asctime)s %(message)s', filemode='a')
logger=logging.getLogger()
class KMExtracter:
def __init__(self,metric_collected_date):
self.metric_collected_date = metric_collected_date
# argument conn is a db connection which will passed in seperate program
def get_metrics_by_technology(self, conn, technology):
try:
cursor = conn.cursor()
cursor.execute(
"SELECT COUNT(*) FROM URL_STORE WHERE Technology='{0}' firstExtractionDate BETWEEN '{1} 00:00:00' AND '{1} 23:59:59'".format(self.technology[technology],
self.metric_collected_date
))
count = cursor.fetchone()
return count[0]
except Exception as e:
logging.error("{0} at get_metrics_by_technology()".format(e))
test_store.py
class TestKM(unittest.TestCase):
def test_get_metrics_by_technology(self):
mock_data_interface = Mock()
mock_data_interface.cursor.return_value.execute.return_value.fetchone.return_value(23987,)
km = KMExtracter('2021-04-03')
print(km.get_metrics_by_technology(mock_data_interface, 'SOME'))
self.assertEqual(23987,km.get_metrics_by_technology(mock_data_interface, 'SOME'))
Error I got:
AssertionError: 23987 != None

class TestKM(unittest.TestCase):
def test_get_metrics_by_technology(self):
mock_data_interface = Mock()
# execute.return_value was removed from the below line.
mock_data_interface.cursor.return_value.fetchone.return_value(23987,)
km = KMExtracter('2021-04-03')
print(km.get_metrics_by_technology(mock_data_interface, 'SOME'))
self.assertEqual(23987,km.get_metrics_by_technology(mock_data_interface, 'SOME'))

How to get execution time of postgres cursor (python) [duplicate]

I'm trying to get the performance statistics on queries executed by psycopg2, but the documentation / examples still seem fuzzy and not as clear as it could be.
I've at least got debugging working through the logger.
What would I need to do to access the performance data for the query? I'm wanting to get the number for query execution time.
Is there a method I can access, or something else I need to initialize to output the query execution time?
Here's a pieced together extract of what I have so far:
import psycopg2
import psycopg2.extensions
from psycopg2.extras import LoggingConnection
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
# set higher up in script
db_settings = {
"user": user,
"password": password,
"host": host,
"database": dbname,
}
query_txt = "[query_txt_from file]"
conn = psycopg2.connect(connection_factory=LoggingConnection, **db_settings)
conn.initialize(logger)
cur = conn.cursor()
cur.execute(query_txt)
and I get
DEBUG:__main__: [the query executed]

Easy enough to set timestamp at start of execution and calculate duration at end. You'll need your own simple subclasses of LoggingConnection and LoggingCursor. See my example code.
This is based on source of MinTimeLoggingConnection you can find in psycopg2/extras.py source.
import time
import psycopg2
import psycopg2.extensions
from psycopg2.extras import LoggingConnection, LoggingCursor
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
# MyLoggingCursor simply sets self.timestamp at start of each query
class MyLoggingCursor(LoggingCursor):
def execute(self, query, vars=None):
self.timestamp = time.time()
return super(MyLoggingCursor, self).execute(query, vars)
def callproc(self, procname, vars=None):
self.timestamp = time.time()
return super(MyLoggingCursor, self).callproc(procname, vars)
# MyLogging Connection:
# a) calls MyLoggingCursor rather than the default
# b) adds resulting execution (+ transport) time via filter()
class MyLoggingConnection(LoggingConnection):
def filter(self, msg, curs):
return msg + " %d ms" % int((time.time() - curs.timestamp) * 1000)
def cursor(self, *args, **kwargs):
kwargs.setdefault('cursor_factory', MyLoggingCursor)
return LoggingConnection.cursor(self, *args, **kwargs)
db_settings = {
....
}
query_txt = "[query_text_from file]"
conn = psycopg2.connect(connection_factory=MyLoggingConnection, **db_settings)
conn.initialize(logger)
cur = conn.cursor()
cur.execute(query_text)
and you'll get:
DEBUG: __main__:[query] 3 ms
within your filter() you can change the formatting, or choose to not display, if less than some value.

How to read csv in parallel and write in Cassandra in parallel for achieving high throughput?

I have tried using execute, execute_async and execute_concurrent in Cassandra but for reading 10M rows, I could index them in Cassandra in no less than 55 mins. Note that I have had set the concurrent threads to 1000, tuned the YAML file's concurrent read and write limits as well (to 10000). I have tried replication factor 0, 1, 2 while creating the cluster. None could index the file in less time. So, I decided that instead of reading the csv sequentially, appending it in a list and then writing to Cassandra in batch, concurrent mode or in async mode, how about reading the CSV in parallel?! Hence, I used dask to read the csv file of 10M rows.
import json
import logging
from datetime import datetime
import dask.dataframe as dd
import dask.multiprocessing
import sys
import json
import pandas as pd
from cassandra import ConsistencyLevel, WriteTimeout
from cassandra.cluster import BatchStatement, Cluster
from cassandra.query import SimpleStatement
from cassandra.concurrent import execute_concurrent, execute_concurrent_with_args
class PythonCassandraExample:
def __init__(self, version):
self.cluster = None
self.session = None
self.keyspace = None
self.log = None
self.version = version
def __del__(self):
self.cluster.shutdown()
def createsession(self):
self.cluster = Cluster(['localhost'], connect_timeout=50)
self.session = self.cluster.connect(self.keyspace)
def getsession(self):
return self.session
# How about Adding some log info to see what went wrong
def setlogger(self):
log = logging.getLogger()
log.setLevel('INFO')
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
log.addHandler(handler)
self.log = log
# Create Keyspace based on Given Name
def handle_error(self, exception):
self.log.error("Failed to fetch user info: %s", exception)
def createkeyspace(self, keyspace):
"""
:param keyspace: The Name of Keyspace to be created
:return:
"""
# Before we create new lets check if exiting keyspace; we will drop that and create new
rows = self.session.execute(
"SELECT keyspace_name FROM system_schema.keyspaces")
if keyspace in [row[0] for row in rows]:
self.log.info("dropping existing keyspace...")
self.session.execute("DROP KEYSPACE " + keyspace)
self.log.info("creating keyspace...")
self.session.execute("""
CREATE KEYSPACE %s
WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '3' }
""" % keyspace)
self.log.info("setting keyspace...")
self.session.set_keyspace(keyspace)
def create_table(self, table_name):
self.table_name = table_name
c_sql = "CREATE TABLE IF NOT EXISTS {} (id varchar, version varchar, row varchar, PRIMARY KEY(id, version));".format(
self.table_name)
print("Query for creating table is: {}".format(c_sql))
self.session.execute(c_sql)
self.log.info("DP Table Created !!!")
self.insert_sql = self.session.prepare(
(
"INSERT INTO {} ({}, {}, {}) VALUES (?,?,?)"
).format(
self.table_name, "id", "version", "row"
)
)
# lets do some batch insert
def insert_data(self, key, version, row):
self.session.execute(
self.insert_sql, [key, version, row]
)
#dask.delayed
def print_a_block(self, d):
d = d.to_dict(orient='records')
for row in d:
key = str(row["0"])
row = json.dumps(row, default=str)
self.insert_data(key, self.version, row)
if __name__ == '__main__':
start_time = datetime.utcnow()
example1 = PythonCassandraExample(version="version_1")
example1.createsession()
example1.setlogger()
example1.createkeyspace('fri_athena_two')
example1.create_table('TenMillion')
example1.log.info("Calling compute!")
df = dd.read_csv("/Users/aviralsrivastava/dev/levelsdb-learning/10gb.csv")
dask.compute(*[example1.print_a_block(d) for d in df.to_delayed()])
print(datetime.utcnow() - start_time)
Even using dask, all efforts were in vain as an hour passed by and yet, the task of writing rows into Cassandra was yet not completed? What else should I do in order to decrease the time taken?

Optimal way to fetch/publish data to/from cloudant in the Spark Streaming Env. - python/pySpark

I am looking to fetch and publish data from spark streaming onto cloudant. My code is as follows -
from CloudantPublisher import CloudantPublisher
from CloudantFetcher import CloudantFetcher
import pyspark
from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from kafka import KafkaConsumer, KafkaProducer
import json
class SampleFramework():
def __init__(self):
pass
#staticmethod
def messageHandler(m):
return json.loads(m.message)
#staticmethod
def processData(rdd):
if (rdd.isEmpty()):
SampleFramework.logger.info("RDD is empty")
return
# Expand
expanded_rdd = rdd.mapPartitions(CloudantFetcher.fetch)
expanded_rdd.foreachPartition(CloudantPublisher.publish)
def run(self, ssc):
self.ssc = ssc
directKafkaStream = KafkaUtils.createDirectStream(self.ssc, SUBSCRIBE_QUEUE], \
{"metadata.broker.list": METADATA, \
"bootstrap.servers": BOOTSTRAP_SERVERS}, \
messageHandler= SampleFramework.messageHandler)
directKafkaStream.foreachRDD(SampleFramework.processData)
ssc.start()
ssc.awaitTermination()
Other supporting classes -
from CloudantConnector import CloudantConnector
class CloudantFetcher:
config = Config.createConfig()
cloudantConnector = CloudantConnector(config)
#staticmethod
def fetch(data):
final_data = []
for row in data:
id = row["id"]
if(not CloudantFetcher.cloudantConnector.isReady()):
CloudantFetcher.cloudantConnector.open()
data_json = CloudantFetcher.cloudantConnector.getOne({"id": id})
row["data"] = data_json
final_data.append(row)
CloudantFetcher.cloudantConnector.close()
return final_data
class CloudantPublisher:
config = Config.createConfig()
cloudantConnector = CloudantConnector(config)
#staticmethod
def publish(data):
CloudantPublisher.cloudantConnector.open()
CloudantPublisher.cloudantConnector.postAll(data)
CloudantPublisher.cloudantConnector.close()
from cloudant.client import Cloudant
from cloudant.result import Result
from cloudant.result import QueryResult
from cloudant.document import Document
from cloudant.query import Query
from cloudant.database import CloudantDatabase
import json
class CloudantConnector:
def __init__(self, config, db_name):
self.config = config
self.client = Cloudant(self.config["cloudant"]["credentials"]["username"], self.config["cloudant"]["credentials"]["password"], url=self.config["cloudant"]["host"]["full"])
self.initialized = False
self.db_name = self.config["cloudant"]["host"]["db_name"]
def open(self):
try:
self.client.connect()
self.logger.info("Connection to Cloudant established.")
self.initialized = True
except:
raise Exception("Could not connect to Cloudant! Please verify credentials.")
self.database = CloudantDatabase(self.client,self.db_name)
if self.database.exists():
pass
else:
self.database.create()
def isReady(self):
return self.initialized
def close(self):
self.client.disconnect()
def getOne(self, query):
new_filter = query
query = Query(self.database, selector = query, limit=1)
results_string = json.dumps(query.result[0][0])
results_json = json.loads(results_string)
return results_json
def postAll(self, docs):
documents = []
quantum = self.config["cloudant"]["constants"]["bulk_quantum"]
count = 0
for doc in docs:
document = Document(self.database)
document["id"] = doc["id"]
document["data"] = doc["data"]
documents.append(document)
count = count + 1
if(count%quantum==0):
self.database.bulk_docs(documents)
documents = []
if(len(documents)!=0):
self.database.bulk_docs(documents)
self.logger.debug("Uploaded document to the Cloudant database.")
My implementation works, but it's slow as compared to what I would expect in the case of not initializing the cloudant connection in each partition and maintaining a static source of these connection threads which can be passed on to each partition to use/ fetched by each partition to use.
My Questions are as follows:
Do I need to create a connection pool with cloudant 2.0 API in python? (It seems that it already exists within the API). If yes, then how should I go about it? The closest I have seen an implementation is this - link, but it's on an outdated cloudant api and does not work with the new one.
If the answer to the above is 'Yes', How can I make this accessible to the workers? I see references to creating serializable, lazily instantiated connection-client objects here. This would mean that I would make a lazily instantiated cloudant connection object in the SampleFramework. How can I do this in Python? Just like given in the spark documentation.
connection = ConnectionPool.getConnection()
for record in iter:
connection.send(record)
ConnectionPool.returnConnection(connection)
If the above is not possible, how do I speed up my operations? The only alternative I can think off is maintaining a single connection on the driver program, collecting the data from all workers and then fetching/uploading the same. This would decrease the number of times I need to connect to cloudant, but would take away the distributed fetching/publishing architecture.

How to JSON dump to a rotating file object

I'm writing a program which periodically dumps old data from a RethinkDB database into a file and removes it from the database. Currently, the data is dumped into a single file which grows without limit. I'd like to change this so that the maximum file size is, say, 250 Mb, and the program starts to write to a new output file just before this size is exceeded.
It seems like Python's RotatingFileHandler class for loggers does approximately what I want; however, I'm not sure whether logging can be applied to any JSON-dumpable object or just to strings.
Another possible approach would be to use (a variant of) Mike Pennington's
RotatingFile class (see python: outfile to another text file if exceed certain file size).
Which of these approaches is likely to be the most fruitful?
For reference, my current program is as follows:
import os
import sys
import json
import rethinkdb as r
import pytz
from datetime import datetime, timedelta
import schedule
import time
import functools
from iclib import RethinkDB
import msgpack
''' The purpose of the Controller is to periodically archive data from the "sensor_data" table so that it does not grow without limit.'''
class Controller(RethinkDB):
def __init__(self, db_address=(os.environ['DB_ADDR'], int(os.environ['DB_PORT'])), db_name=os.environ['DB_NAME']):
super(Controller, self).__init__(db_address=db_address, db_name=db_name) # Initialize the IperCronComponent with the default logger name (in this case, "Controller")
self.db_table = RethinkDB.SENSOR_DATA_TABLE # The table name is "sensor_data" and is stored as a class variable in RethinkDBMixIn
def generate_archiving_query(self, retention_period=timedelta(days=3)):
expiry_time = r.now() - retention_period.total_seconds() # Timestamp before which data is to be archived
if "timestamp" in r.table(self.db_table).index_list().run(self.db): # If "timestamp" is a secondary index
beginning_of_time = r.time(1400, 1, 1, 'Z') # The minimum time of a ReQL time object (i.e., the year 1400 in the UTC timezone)
data_to_archive = r.table(self.db_table).between(beginning_of_time, expiry_time, index="timestamp") # Generate query using "between" (faster)
else:
data_to_archive = r.table(self.db_table).filter(r.row['timestamp'] < expiry_time) # Generate the same query using "filter" (slower, but does not require "timestamp" to be a secondary index)
return data_to_archive
def archiving_job(self, data_to_archive=None, output_file="archived_sensor_data.json"):
if data_to_archive is None:
data_to_archive = self.generate_archiving_query() # By default, the call the "generate_archiving_query" function to generate the query
old_data = data_to_archive.run(self.db, time_format="raw") # Without time_format="raw" the output does not dump to JSON
with open(output_file, 'a') as f:
ids_to_delete = []
for item in old_data:
print item
# msgpack.dump(item, f)
json.dump(item, f)
f.write('\n') # Separate each document by a new line
ids_to_delete.append(item['id'])
r.table(self.db_table).get_all(r.args(ids_to_delete)).delete().run(self.db) # Delete based on ID. It is preferred to delete the entire batch in a single operation rather than to delete them one by one in the for loop.
def test_job_1():
db_name = "ipercron"
table_name = "sensor_data"
port_offset = 1 # To avoid interference of this testing program with the main program, all ports are initialized at an offset of 1 from the default ports using "rethinkdb --port_offset 1" at the command line.
conn = r.connect("localhost", 28015 + port_offset)
r.db(db_name).table(table_name).delete().run(conn)
import rethinkdb_add_data
controller = Controller(db_address=("localhost", 28015+port_offset))
archiving_job = functools.partial(controller.archiving_job, data_to_archive=controller.generate_archiving_query())
return archiving_job
if __name__ == "__main__":
archiving_job = test_job_1()
schedule.every(0.1).minutes.do(archiving_job)
while True:
schedule.run_pending()
It is not completely 'runnable' from the part shown, but the key point is that I would like to replace the line
json.dump(item, f)
with a similar line in which f is a rotating, and not fixed, file object.

Following Stanislav Ivanov, I used json.dumps to convert each RethinkDB document to a string and wrote this to a RotatingFileHandler:
import os
import sys
import json
import rethinkdb as r
import pytz
from datetime import datetime, timedelta
import schedule
import time
import functools
from iclib import RethinkDB
import msgpack
import logging
from logging.handlers import RotatingFileHandler
from random_data_generator import RandomDataGenerator
''' The purpose of the Controller is to periodically archive data from the "sensor_data" table so that it does not grow without limit.'''
os.environ['DB_ADDR'] = 'localhost'
os.environ['DB_PORT'] = '28015'
os.environ['DB_NAME'] = 'ipercron'
class Controller(RethinkDB):
def __init__(self, db_address=None, db_name=None):
if db_address is None:
db_address = (os.environ['DB_ADDR'], int(os.environ['DB_PORT'])) # The default host ("rethinkdb") and port (28015) are stored as environment variables
if db_name is None:
db_name = os.environ['DB_NAME'] # The default database is "ipercron" and is stored as an environment variable
super(Controller, self).__init__(db_address=db_address, db_name=db_name) # Initialize the instance of the RethinkDB class. IperCronComponent will be initialized with its default logger name (in this case, "Controller")
self.db_name = db_name
self.db_table = RethinkDB.SENSOR_DATA_TABLE # The table name is "sensor_data" and is stored as a class variable of RethinkDBMixIn
self.table = r.db(self.db_name).table(self.db_table)
self.archiving_logger = logging.getLogger("archiving_logger")
self.archiving_logger.setLevel(logging.DEBUG)
self.archiving_handler = RotatingFileHandler("archived_sensor_data.log", maxBytes=2000, backupCount=10)
self.archiving_logger.addHandler(self.archiving_handler)
def generate_archiving_query(self, retention_period=timedelta(days=3)):
expiry_time = r.now() - retention_period.total_seconds() # Timestamp before which data is to be archived
if "timestamp" in self.table.index_list().run(self.db):
beginning_of_time = r.time(1400, 1, 1, 'Z') # The minimum time of a ReQL time object (namely, the year 1400 in UTC)
data_to_archive = self.table.between(beginning_of_time, expiry_time, index="timestamp") # Generate query using "between" (faster, requires "timestamp" to be a secondary index)
else:
data_to_archive = self.table.filter(r.row['timestamp'] < expiry_time) # Generate query using "filter" (slower, but does not require "timestamp" to be a secondary index)
return data_to_archive
def archiving_job(self, data_to_archive=None):
if data_to_archive is None:
data_to_archive = self.generate_archiving_query() # By default, the call the "generate_archiving_query" function to generate the query
old_data = data_to_archive.run(self.db, time_format="raw") # Without time_format="raw" the output does not dump to JSON or msgpack
ids_to_delete = []
for item in old_data:
print item
self.dump(item)
ids_to_delete.append(item['id'])
self.table.get_all(r.args(ids_to_delete)).delete().run(self.db) # Delete based on ID. It is preferred to delete the entire batch in a single operation rather than to delete them one by one in the for-loop.
def dump(self, item, mode='json'):
if mode == 'json':
dump_string = json.dumps(item)
elif mode == 'msgpack':
dump_string = msgpack.packb(item)
self.archiving_logger.debug(dump_string)
def populate_database(db_name, table_name, conn):
if db_name not in r.db_list().run(conn):
r.db_create(db_name).run(conn) # Create the database if it does not yet exist
if table_name not in r.db(db_name).table_list().run(conn):
r.db(db_name).table_create(table_name).run(conn) # Create the table if it does not yet exist
r.db(db_name).table(table_name).delete().run(conn) # Empty the table to start with a clean slate
# Generate random data with timestamps uniformly distributed over the past 6 days
random_data_time_interval = timedelta(days=6)
start_random_data = datetime.utcnow().replace(tzinfo=pytz.utc) - random_data_time_interval
random_generator = RandomDataGenerator(seed=0)
packets = random_generator.packets(N=100, start=start_random_data)
# print packets
print "Adding data to the database..."
r.db(db_name).table(table_name).insert(packets).run(conn)
if __name__ == "__main__":
db_name = "ipercron"
table_name = "sensor_data"
port_offset = 1 # To avoid interference of this testing program with the main program, all ports are initialized at an offset of 1 from the default ports using "rethinkdb --port_offset 1" at the command line.
host = "localhost"
port = 28015 + port_offset
conn = r.connect(host, port) # RethinkDB connection object
populate_database(db_name, table_name, conn)
# import rethinkdb_add_data
controller = Controller(db_address=(host, port))
archiving_job = functools.partial(controller.archiving_job, data_to_archive=controller.generate_archiving_query()) # This ensures that the query is only generated once. (This is sufficient since r.now() is re-evaluated every time a connection is made).
schedule.every(0.1).minutes.do(archiving_job)
while True:
schedule.run_pending()
In this context the RethinkDB class does little other than define the class variable SENSOR_DATA_TABLE and the RethinkDB connection, self.db = r.connect(self.address[0], self.address[1]). This is run together with a module for generating fake data, random_data_generator.py:
import random
import faker
from datetime import datetime, timedelta
import pytz
import rethinkdb as r
class RandomDataGenerator(object):
def __init__(self, seed=None):
self._seed = seed
self._random = random.Random()
self._random.seed(seed)
self.fake = faker.Faker()
self.fake.random.seed(seed)
def __getattr__(self, x):
return getattr(self._random, x)
def name(self):
return self.fake.name()
def datetime(self, start=None, end=None):
if start is None:
start = datetime(2000, 1, 1, tzinfo=pytz.utc) # Jan 1st 2000
if end is None:
end = datetime.utcnow().replace(tzinfo=pytz.utc)
if isinstance(end, datetime):
dt = end - start
elif isinstance(end, timedelta):
dt = end
assert isinstance(dt, timedelta)
random_dt = timedelta(microseconds=self._random.randrange(int(dt.total_seconds() * (10 ** 6))))
return start + random_dt
def packets(self, N=1, start=None, end=None):
return [{'name': self.name(), 'timestamp': self.datetime(start=start, end=end)} for _ in range(N)]
When I run controller it produces several rolled-over output logs, each at most 2 kB in size, as expected:

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How would I get psycopg2 logging of query execution time? - python

Related

Mocking cursor.fetchone() returns None istead of returning a value in python

How to get execution time of postgres cursor (python) [duplicate]

How to read csv in parallel and write in Cassandra in parallel for achieving high throughput?

Optimal way to fetch/publish data to/from cloudant in the Spark Streaming Env. - python/pySpark

How to JSON dump to a rotating file object

Categories

Resources