Class architecture to represent a single query with connection and processing attributes - python

I need to build a piece of software that pulls data from different data sources. Each data source is represented by a (server, db_name) pair, and for each data source I need to run a specific query (parametrized by one or two fields).
This is the basic structure I came up with. I am not convinced it is the best way to implement this, especially the get_data() function inside the base class.
My question, then: is there a better way of implementing this logic?
class BrandedCube(Cube):
    def __init__(self, measure: str = "ALL", cache_path=""):
        super().__init__(measure=measure)
        self.server = 'SERVER_NAME'
        self.db_name = 'DB_NAME'
        self.source_name = ""
        self.account_name_hierarchy = ["Account Name"]
        self.cache_path = cache_path
        self.query = """
        EVALUATE
        SUMMARIZECOLUMNS(
            # STATIC LIST OF COLUMNS,
            # STATIC LIST OF FILTERS,
            # SOME PARAMETRIC FILTERS
            KEEPFILTERS( TREATAS( {{"{qtr}"}}, 'Calendar'[Quarter] )),
            "{measure}", [{measure}]
        )
        """

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        # Example of processing steps
        df.rename(columns={'Calendar[qtr_sort]': 'Quarter'}, inplace=True)
        for col in self.measure_list:
            df[col] = df[col].astype(str).astype(float).fillna(0)
        df.reset_index(drop=True, inplace=True)
        return df

class Cube:
    def __init__(self, measure: str):
        self.server = ""
        self.db_name = ""
        # Query to pull data
        self.query = """"""
        # Path to save output data
        self.cache_path = ""
        # Quarter format string
        self.quarter_format = self.generate_std_format()
        # Whether or not to use the MAAD library to pull data
        self.maad_library = False
        self.maad_library_dbtype = "ssas"  # ssas, sql
        self.measure_list = self.generate_measure_list(measure)
        self.measure = measure
        self.id_col: str = ""
        self.kpi_dict = {}
        self.kpi_list = []

    def get_data(self, qtr: str, measure: str) -> pd.DataFrame:
        """Query the cube with the query stored in the class.
        The query is formatted with the quarter parameter.
        1. maad_library = True : use the maad library to pull the data -> requires an MDX query string
        2. maad_library = False: use the Luis library to pull the data -> requires a DAX query
        """
        if self.maad_library:
            mdx_query = self.query.format(qtr=qtr, measure=measure)
            df = maad.get_data(mdx_query, db=self.db_name, dbtype=self.maad_library_dbtype)
        else:
            format_query = self.query.format(qtr=qtr, measure=measure)
            conn = create_connection(self.server, self.db_name)
            df = ssas_api.get_DAX(conn, format_query)
        return df

Related

What is the best way to return a variable or call a function to maximize code reuse?

I was wondering if I could get some input from some seasoned Python experts; I have a couple of questions.
I am extracting data from an API request and calculating the total vulnerabilities.
What is the best way to return this data so that I can call it in another function?
How can I add up all the vulnerabilities? (Right now it only sums 500 at a time; I'd like the sum of every vulnerability.)
def _request():
    third_party_patching_filer = {
        "asset": "asset.agentKey IS NOT NULL",
        "vulnerability": "vulnerability.categories NOT IN ['microsoft patch']"}
    headers = _headers()
    print(headers)
    url1 = f"https://us.api.insight.rapid7.com/vm/v4/integration/assets"
    resp = requests.post(url=url1, headers=headers, json=third_party_patching_filer, verify=False).json()
    jsonData = resp
    #print(jsonData)
    has_next_cursor = False
    nextKey = ""
    if "cursor" in jsonData["metadata"]:
        has_next_cursor = True
        nextKey = jsonData["metadata"]["cursor"]
    while has_next_cursor:
        url2 = f"https://us.api.insight.rapid7.com/vm/v4/integration/assets?&size=500&cursor={nextKey}"
        resp2 = requests.post(url=url2, headers=headers, json=third_party_patching_filer, verify=False).json()
        cursor = resp2["metadata"]
        print(cursor)
        if "cursor" in cursor:
            nextKey = cursor["cursor"]
            print(f"next key {nextKey}")
            #print(desktop_support)
            for data in resp2["data"]:
                for tags in data['tags']:
                    total_critical_vul_osswin = []
                    total_severe_vul_osswin = []
                    total_modoer_vuln_osswin = []
                    if tags["name"] == 'OSSWIN':
                        print("OSSWIN")
                        critical_vuln_osswin = data['critical_vulnerabilities']
                        severe_vuln_osswin = data['severe_vulnerabilities']
                        modoer_vuln_osswin = data['moderate_vulnerabilities']
                        total_critical_vul_osswin.append(critical_vuln_osswin)
                        total_severe_vul_osswin.append(severe_vuln_osswin)
                        total_modoer_vuln_osswin.append(modoer_vuln_osswin)
                        print(sum(total_critical_vul_osswin))
                        print(sum(total_severe_vul_osswin))
                        print(sum(total_modoer_vuln_osswin))
                    if tags["name"] == 'DESKTOP_SUPPORT':
                        print("Desktop")
                        total_critical_vul_desktop = []
                        total_severe_vul_desktop = []
                        total_modorate_vuln_desktop = []
                        critical_vuln_desktop = data['critical_vulnerabilities']
                        severe_vuln_desktop = data['severe_vulnerabilities']
                        moderate_vuln_desktop = data['moderate_vulnerabilities']
                        total_critical_vul_desktop.append(critical_vuln_desktop)
                        total_severe_vul_desktop.append(severe_vuln_desktop)
                        total_modorate_vuln_desktop.append(moderate_vuln_desktop)
                        print(sum(total_critical_vul_desktop))
                        print(sum(total_severe_vul_desktop))
                        print(sum(total_modorate_vuln_desktop))
                    else:
                        pass
        else:
            has_next_cursor = False
If you have a lot of parameters to pass, consider using a dict to combine them. Then you can just return the dict and pass it along to the next function that needs that data. Another approach would be to create a class and either access the variables directly or have helper functions that do so. The latter is a cleaner solution than a dict, since with a dict you have to quote every variable name, while with a class you can easily add additional functionality beyond just being a container for a bunch of instance variables.
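For instance, a minimal sketch of the class-based approach (the class and field names here are illustrative, not taken from your code):
# Sketch of returning totals via a small results object instead of printing them.
from dataclasses import dataclass

@dataclass
class VulnTotals:
    critical: int = 0
    severe: int = 0
    moderate: int = 0

    def add(self, asset: dict) -> None:
        # Accumulate the per-asset counts that the API returns.
        self.critical += asset.get('critical_vulnerabilities', 0)
        self.severe += asset.get('severe_vulnerabilities', 0)
        self.moderate += asset.get('moderate_vulnerabilities', 0)

# A caller can then do: totals = VulnTotals(); totals.add(data); return totals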
If you want the total across all the data, you should put these initializations:
total_critical_vul_osswin = []
total_severe_vul_osswin = []
total_modoer_vuln_osswin = []
before the while has_next_cursor loop (and similarly for the desktop totals). The way your code is currently written, they are re-initialized on every cursor iteration (i.e., every 500 samples, based on the URL).
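Applied to your loop, the accumulation could look roughly like this sketch (only the OSSWIN totals are shown; the desktop totals follow the same pattern, and the request/pagination code is elided):
# Sketch: initialize once, before paging through the cursor results.
total_critical_vul_osswin = []
total_severe_vul_osswin = []
total_modoer_vuln_osswin = []
while has_next_cursor:
    # ... fetch resp2 exactly as in the question ...
    for data in resp2["data"]:
        for tags in data['tags']:
            if tags["name"] == 'OSSWIN':
                total_critical_vul_osswin.append(data['critical_vulnerabilities'])
                total_severe_vul_osswin.append(data['severe_vulnerabilities'])
                total_modoer_vuln_osswin.append(data['moderate_vulnerabilities'])
# Grand totals across every page:
print(sum(total_critical_vul_osswin))
print(sum(total_severe_vul_osswin))
print(sum(total_modoer_vuln_osswin))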

Updating multiple attributes of a python object - unit testing

import requests

class RadioInfo:
    def __init__(self, url, station_num=0):
        self.url = url
        self.station_num = station_num

    def update(self):
        radio_info = self.get_json()
        self.update_info(radio_info)

    def get_json(self):
        r = requests.get(self.url)
        return r.json()[self.station_num]

    def update_info(self, radio_info):
        self.radio_station_name = radio_info["station"]["name"]
        self.radio_station_description = radio_info["station"]["description"]
        self.current_listeners = radio_info["listeners"]["current"]
        self.unique_listeners = radio_info["listeners"]["unique"]
        self.total_listeners = radio_info["listeners"]["total"]
        self.now_playing_playlist = radio_info["now_playing"]["playlist"]
        self.now_playing_song_title = radio_info["now_playing"]["song"]["title"]
        self.now_playing_song_artist = radio_info["now_playing"]["song"]["artist"]
        self.now_playing_song_album = radio_info["now_playing"]["song"]["album"]
        ###
        # What if radio_info["playing_next"] is None ??
        ###
        self.next_playing_playlist = radio_info["playing_next"]["playlist"]
        self.next_playing_song_title = radio_info["playing_next"]["song"]["title"]
        self.next_playing_song_artist = radio_info["playing_next"]["song"]["artist"]
        self.next_playing_song_album = radio_info["playing_next"]["song"]["album"]
Given the above, I'm trying to figure out how to refactor this so that I can check whether some of the nodes are None and handle that accordingly.
Additionally, I'm just starting out with unit testing, and besides error handling I was wondering about best practices for unit testing this object.
Should I create a method for each property and unit test each one with pytest.mark.parametrize, for example, or is that considered redundant and unnecessary?
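One possible direction for the None handling (a sketch only, not an authoritative answer): extract each block of the payload through a small helper that tolerates missing or None nodes, then assign from the returned dict.
# Sketch: a None-tolerant extraction helper; the keys mirror the question's attributes.
def _song_fields(node):
    # node is radio_info["playing_next"] (or "now_playing"); it may be None.
    if not node:
        return {"playlist": None, "title": None, "artist": None, "album": None}
    song = node.get("song") or {}
    return {
        "playlist": node.get("playlist"),
        "title": song.get("title"),
        "artist": song.get("artist"),
        "album": song.get("album"),
    }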

SQLAlchemy with_entities with dynamic inputs

How can I provide the with_entities option with dynamic input?
At the moment I have to do this:
columns = [DataModel.col1, DataModel.col2, ...]
data = DataModel.query.order_by(DataModel.id.desc()).with_entities(*columns).first()
But what should I do if I get the column names as query-string parameters and have to define them dynamically?
EDIT:
I solved it this way. In my model I define a classmethod:
@classmethod
def find_by_filter(cls, attr):
    search_columns = [getattr(cls, i) for i in attr]
    return cls.query.order_by(cls.id.desc()).with_entities(*search_columns).first()
and then I can call it from my REST API this way:
liste = ["column 1", "column2", "column3", "column4"]
data = DataModel.find_by_filter(liste)
I had this same challenge and this is how I solved it.
columns is just a CSV string, e.g. "field1,field2,field3".
It could be further condensed, but this is enough for readability.
def pick_Columns(modelClass, columns):
    if len(columns) == 0:
        return None
    colArray = columns.split(',')
    modelObjects = [eval(f'{modelClass.__name__}.{col}') for col in colArray]
    return modelObjects

def Get_Data(**kwargs):
    retVal = kwargs['db'].query(m_DataModel)
    if kwargs.get('columns', None) is not None:
        colObj = pick_Columns(m_DataModel, kwargs['columns'])
        retVal = retVal.with_entities(*colObj)
        retVal = [row._asdict() for row in retVal.all()]  # with_entities returns a tuple instead of a dict. Remove this line if that's what you want.
    else:
        retVal = retVal.all()
    return retVal
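A hypothetical call site, assuming `session` is a SQLAlchemy session and `field1`/`field2` are real column names on the model (both are assumptions for illustration):
# Hypothetical usage of Get_Data; 'session' and the column names are assumed.
rows = Get_Data(db=session, columns="field1,field2")   # list of dicts with the selected columns
all_rows = Get_Data(db=session)                        # full model instances via .all()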

load a csv file into BigQuery with auto-detected schema using the python API

I'm trying to load a CSV file with schema auto-detection, but I am unable to load the file into BigQuery. Can anyone help me with this?
Please find my code below:
def load_data_from_file(dataset_name, table_name, source_file_name):
    bigquery_client = bigquery.Client()
    dataset = bigquery_client.dataset(dataset_name)
    table = dataset.table(table_name)
    table.reload()
    with open(source_file_name, 'rb') as source_file:
        job = table.upload_from_file(
            source_file, source_format='text/csv')
    wait_for_job(job)
    print('Loaded {} rows into {}:{}.'.format(
        job.output_rows, dataset_name, table_name))

def wait_for_job(job):
    while True:
        job.reload()
        if job.state == 'DONE':
            if job.error_result:
                raise RuntimeError(job.errors)
            return
        time.sleep(1)
Based on the Google BigQuery python API documentation, you should set source_format to 'CSV' instead of 'text/csv':
source_format='CSV'
Code Sample:
with open(csv_file.name, 'rb') as readable:
    table.upload_from_file(
        readable, source_format='CSV', skip_leading_rows=1)
Source: https://googlecloudplatform.github.io/google-cloud-python/stable/bigquery-usage.html#datasets
If this does not solve your problem, please provide more specifics about the errors you are observing.
You can use the below code snippet to create and load data (CSV format) from Cloud Storage to BigQuery with auto-detect schema:
from google.cloud import bigquery
bigqueryClient = bigquery.Client()
jobConfig = bigquery.LoadJobConfig()
jobConfig.skip_leading_rows = 1
jobConfig.source_format = bigquery.SourceFormat.CSV
jobConfig.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
jobConfig.autodetect = True
datasetName = "dataset-name"
targetTable = "table_name"
uri = "gs://bucket_name/file.csv"
tableRef = bigqueryClient.dataset(datasetName).table(targetTable)
bigqueryJob = bigqueryClient.load_table_from_uri(uri, tableRef, job_config=jobConfig)
bigqueryJob.result()
Currently, the Python client has no support for loading data from a file with a schema auto-detection flag (I plan on submitting a pull request to add this support, but first I'd like to ask the maintainers what their opinions are on this implementation).
There are still some ways to work around this. I haven't found a very elegant solution so far, but the code below allows you to add schema detection as an input flag:
from google.cloud.bigquery import Client
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'path/your/json.key'
import google.cloud.bigquery.table as mtable

def _configure_job_metadata(metadata,
                            allow_jagged_rows,
                            allow_quoted_newlines,
                            create_disposition,
                            encoding,
                            field_delimiter,
                            ignore_unknown_values,
                            max_bad_records,
                            quote_character,
                            skip_leading_rows,
                            write_disposition):
    load_config = metadata['configuration']['load']
    if allow_jagged_rows is not None:
        load_config['allowJaggedRows'] = allow_jagged_rows
    if allow_quoted_newlines is not None:
        load_config['allowQuotedNewlines'] = allow_quoted_newlines
    if create_disposition is not None:
        load_config['createDisposition'] = create_disposition
    if encoding is not None:
        load_config['encoding'] = encoding
    if field_delimiter is not None:
        load_config['fieldDelimiter'] = field_delimiter
    if ignore_unknown_values is not None:
        load_config['ignoreUnknownValues'] = ignore_unknown_values
    if max_bad_records is not None:
        load_config['maxBadRecords'] = max_bad_records
    if quote_character is not None:
        load_config['quote'] = quote_character
    if skip_leading_rows is not None:
        load_config['skipLeadingRows'] = skip_leading_rows
    if write_disposition is not None:
        load_config['writeDisposition'] = write_disposition
    load_config['autodetect'] = True  # --> Here you can add the option for schema auto-detection

mtable._configure_job_metadata = _configure_job_metadata

bq_client = Client()
ds = bq_client.dataset('dataset_name')
ds.table = lambda: mtable.Table('table_name', ds)
table = ds.table()

with open(source_file_name, 'rb') as source_file:
    job = table.upload_from_file(
        source_file, source_format='text/csv')
Just wanted to show how I've used the Python client.
Below is my function to create a table and load it from a CSV file.
Also, self.client is my bigquery.Client().
def insertTable(self, datasetName, tableName, csvFilePath, schema=None):
    """
    This function creates a table in the given dataset in our default project
    and inserts the data given via a csv file.
    :param datasetName: The name of the dataset in which the table needs to be created
    :param tableName: The name of the table to be created
    :param csvFilePath: The path of the file to be inserted
    :param schema: The schema of the table to be created
    :return: returns nothing
    """
    csv_file = open(csvFilePath, 'rb')
    dataset_ref = self.client.dataset(datasetName)
    # <import>: from google.cloud.bigquery import Dataset
    dataset = Dataset(dataset_ref)
    table_ref = dataset.table(tableName)
    if schema is not None:
        table = bigquery.Table(table_ref, schema)
    else:
        table = bigquery.Table(table_ref)
    try:
        self.client.delete_table(table)
    except:
        pass
    table = self.client.create_table(table)
    # <import>: from google.cloud.bigquery import LoadJobConfig
    job_config = LoadJobConfig()
    table_ref = dataset.table(tableName)
    job_config.source_format = 'CSV'
    job_config.skip_leading_rows = 1
    job_config.autodetect = True
    job = self.client.load_table_from_file(
        csv_file, table_ref, job_config=job_config)
    job.result()
Let me know if this solves your problem.

pymongo- upsert not able to perform insertion with $set operation

I have an empty collection and thousands of entries to process (the entries might have redundancy, which is why I want to use both updates and inserts).
The Python code (using pymongo) I wrote:
for mydoc in alldocs:
    key = {'myid': mydoc['myid']}
    data = process_doc(mydoc)  # returns a simple dictionary
    db.mydocs.update(key, {"$set": data}, upsert=True)
This code is unable to perform any insert operations; the collection remains empty. But when I remove $set and simply pass data, it works fine. Can't I use $set with upsert? The reason I want $set is so that pre-existing fields of a document don't get affected. Can someone please advise? I really can't figure out what to do.
Reproducible code:
from pymongo import Connection

DB_CONTENT_BASE_KEY = 'contentbase'

def connect_to_db(dbname, hostname='localhost', portno=27017, **kwargs):
    connection = Connection(hostname, portno)
    dbConnection = connection[dbname]
    return dbConnection

class MetawebCustomCollectionBuilder(object):
    # key ought to be a dictionary to filter results from contentbase.
    def __init__(self, inDbConfig, outDbConfig, key={}, verbose=False):
        self.verbose = verbose
        self.inDbConfig = inDbConfig
        self.inDb = connect_to_db(**inDbConfig)
        self.outDbConfig = outDbConfig
        self.outDb = connect_to_db(**outDbConfig)
        self.inDbContentBase = self.inDb[self.inDbConfig[DB_CONTENT_BASE_KEY]]
        self.outDbContentBase = self.outDb[self.outDbConfig[DB_CONTENT_BASE_KEY]]
        self.key = key
        self.in_db_collection_constraints()
        self.out_db_collection_constraints()

    def in_db_collection_constraints(self):
        self.inDbContentBase.ensure_index('mid')
        if self.verbose: print("Assured index on mid for inDbContentBase...")

    def out_db_collection_constraints(self):
        self.outDbContentBase.ensure_index('mid')
        if self.verbose: print("Assured index on mid for outDbContentBase...")

    def process_in_record(self, inRecord):
        outRecord = inRecord  # [YET TO] continue from here...
        return outRecord

    def transit_collection(self):
        for record in self.inDbContentBase.find(self.key):
            outRecord = self.process_in_record(record)
            key = {'mid': outRecord['mid']}
            data = outRecord
            print(key)
            self.outDbContentBase.update(key, {"$set": data}, True)
        if self.verbose: print('Done with transiting collection from in DB to out DB')

    def cleanup_out_collection(self):
        pass

    def in_db_sandbox(self):
        # To have tests and analytics placed in here corresponding to inDb.
        pass

if __name__ == '__main__':
    inDbConfig = {'dbname': 'metaweb', 'contentbase': 'content'}
    outDbConfig = {'dbname': 'similarkind', 'contentbase': 'content'}
    mccb = MetawebCustomCollectionBuilder(inDbConfig, outDbConfig, verbose=True)
    mccb.transit_collection()
There must be a pre-existing database inDb. From this collection I want to create a new, modified collection.
Your claim is wrong
>>> import pymongo
>>> c = pymongo.Connection()
>>> db = c.mydb
>>> db.mydocs.find().count()
0
>>> db.mydocs.update({'myid': '438'}, {"$set": {'keyA':'valueA'}}, upsert = True)
>>> db.mydocs.find().count()
1
>>> db.mydocs.find_one()
{u'myid': u'438', u'keyA': u'valueA', u'_id': ObjectId('504c2fd1a694cc9624bbd6a2')}
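pymongo.Connection and Collection.update have since been removed from the driver; on current pymongo versions the equivalent upsert would look roughly like the sketch below (not part of the original answer).
# Rough modern-pymongo equivalent of the shell session above: the same upsert via update_one.
from pymongo import MongoClient

client = MongoClient()  # connects to localhost:27017 by default
db = client.mydb
db.mydocs.update_one({'myid': '438'}, {'$set': {'keyA': 'valueA'}}, upsert=True)
print(db.mydocs.count_documents({}))  # 1
print(db.mydocs.find_one())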
