python: managing crawler classes in another class

I have a class called martCrawler which imports 3 other crawlers from other files.
However, the code gets longer as the number of imported crawlers increases.
Is there a better practice for managing code like this?
Thanks in advance~
from Scripts_mart.wat_ver2 import crawler_watsons
from Scripts_mart.cosmed_ver1 import crawler_cosmed
from Scripts_mart.pxmart_ver1 import crawler_pxmart
import datetime

class martCrawler():
    def __init__(self):
        self.wat = crawler_watsons()
        self.cos = crawler_cosmed()
        self.pxm = crawler_pxmart()

    def crawler_testing(self):
        result_pack = {}
        wat_result = self.wat.run_before_insert()
        cos_result = self.cos.run_before_insert()
        pxm_result = self.pxm.run_before_insert()
        result_pack['wat'] = wat_result
        result_pack['cos'] = cos_result
        result_pack['pxm'] = pxm_result
        return result_pack
    ...

Then why not store all the crawlers in a dict from the beginning?
As an example:
from Scripts_mart.wat_ver2 import crawler_watsons
from Scripts_mart.cosmed_ver1 import crawler_cosmed
from Scripts_mart.pxmart_ver1 import crawler_pxmart

class MartCrawler():
    def __init__(self, *args):
        self.crawlers = {}
        for crawler in args:
            # Use introspection to get the names of the classes.
            # The names should be "crawler_watsons", etc.
            self.crawlers[crawler.__name__] = crawler()

    def crawler_testing(self):
        result_pack = {}
        for name, crawler in self.crawlers.items():
            result_pack[name] = crawler.run_before_insert()
        return result_pack

martCrawler = MartCrawler(crawler_watsons, crawler_cosmed, crawler_pxmart)
Just keep in mind that the keys in your dict will be the actual names of the imported classes, not 'wat', 'cos' and 'pxm'. If that is a problem, you can use some string manipulation and/or regexes to fix it.
For example, you could replace crawler.__name__ with crawler.__name__[8:11], as in the sketch below.
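A minimal sketch of that idea (short_key is a hypothetical helper, not part of the original code; it assumes every class name follows the crawler_<something> pattern):

def short_key(cls, prefix="crawler_", length=3):
    # Strip the shared "crawler_" prefix, then keep the first few characters.
    name = cls.__name__
    if name.startswith(prefix):
        name = name[len(prefix):]
    return name[:length]

# short_key(crawler_watsons) -> 'wat'
# short_key(crawler_cosmed)  -> 'cos'
# short_key(crawler_pxmart)  -> 'pxm'

Inside __init__ you would then write self.crawlers[short_key(crawler)] = crawler().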

Just put them into a dictionary:
from Scripts_mart.wat_ver2 import crawler_watsons
from Scripts_mart.cosmed_ver1 import crawler_cosmed
from Scripts_mart.pxmart_ver1 import crawler_pxmart
import datetime

class martCrawler():
    def __init__(self):
        self.crawlers = {
            "wat": crawler_watsons(),
            "cos": crawler_cosmed(),
            "pxm": crawler_pxmart()
        }

    def crawler_testing(self):
        result_pack = {}
        for key in self.crawlers:
            result_pack[key] = self.crawlers[key].run_before_insert()
        return result_pack
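Usage is then unchanged from the original class (a quick sketch):

crawler = martCrawler()
results = crawler.crawler_testing()   # {'wat': ..., 'cos': ..., 'pxm': ...}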

Related

How do I test start_http_server is working for Prometheus

I have created this monitoring class that updates some Counter metrics according to some logic. I have attached the code. Can someone please explain why my registry would be empty even after I add the test metric?
import logging
from prometheus_client import (
    CollectorRegistry,
    Counter,
    start_http_server
)

class Reporter:
    def __init__(self):
        self._set_counters()
        start_http_server(8080, registry=self.registry)

    def _set_counters(self):
        self.registry = CollectorRegistry()
        self.bycounter = Counter(
            'bycounter',
            'blah blah',
            ['by', 'level0top'],
            registry=self.registry
        )
        self.bycounter.labels(by='test', level0top='test').inc()
I am trying to test the metrics like this:
import unittest
from sc_eol.monitoring import TodayDataReporter
from sc_eol.sc_eol_utils import generate_query_url

reporter = TodayDataReporter()

class TestTodayDataReporter(unittest.TestCase):
    @staticmethod
    def test_publish():
        by = 'level1'
        parse_query = {'level0top': 'WSJ2', 'date': '2021-11-01'}
        start = '2021-11-01'
        print(dir(reporter.registry))
        reporter.registry.collect()
        before = reporter.registry.get_sample_value('bycounter', ['level1', 'WSJ2'])
        print("BEFOREEE", before)
        reporter.registry.collect()
        generate_query_url(by, start, parse_query, reporter)
        before = reporter.registry.get_sample_value('bycounter', {'by': 'level1', 'level0top': 'WSJ2'})
        reporter.registry.collect()
        print("After", before)

if __name__ == "__main__":
    unittest.main()
Why is the bycounter None? How do I test whether a server is running at port 8080 or not?

The lookup
before = reporter.registry.get_sample_value('bycounter', ['level1', 'WSJ2'])
returns None for two reasons: the counter is being renamed to bycounter_total (prometheus_client appends a _total suffix to Counter samples), and the labels must be passed as a dict, not a list.
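A minimal sketch of a lookup that works, assuming the Reporter class from the question (the 'test'/'test' labels match the inc() in _set_counters):

value = reporter.registry.get_sample_value(
    'bycounter_total',                   # Counter samples get a _total suffix
    {'by': 'test', 'level0top': 'test'}  # labels must be a dict, not a list
)
print(value)  # 1.0 after the inc() in _set_counters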

python class: run functions step by step and save them

I have a class that reads a dataframe and another class which processes that dataframe. The functions in the processing class should be applied to the same dataframe step by step to shape the final dataframe, which is then saved as a CSV file.
from pydantic import BaseModel
from config import DATA_REPO
import pandas as pd
import os

class PandaDataFrame(BaseModel):
    data: pd.DataFrame

    class Config:
        arbitrary_types_allowed = True

class Directory(BaseModel):
    data_directory: str

class DataToPandaReader(object):
    def csv_file_reader(self, directory: Directory):
        directory = directory.data_directory
        for file in os.listdir(directory):
            if file.endswith('.csv'):
                return pd.read_csv(os.path.join(directory, file))

class DataProcessor(object):
    def remove_punctuation(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        for col in my_data_to_process:
            if any(word in col for word in ['example', 'text', 'Answer']):
                my_data_to_process = my_data_to_process[col].str.replace('[^\w\s]', '', regex=True)
        return add_number_column(my_data_to_process)

    def add_number_column(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        my_data_to_process['sentence_number'] = range(len(my_data_to_process))
        return save_final_dataframe(my_data_to_process)

    def save_final_dataframe(self, my_: PandaDataFrame):
        my_data_to_process = my_.data
        return my_data_to_process.to_csv('final_data.csv')

def parse_data_process(directory_to_csv_file):
    toprocess = DataProcessor()
    toprocess.save_final_dataframe(directory_to_csv_file)
    toprocess.remove_punctuation(directory_to_csv_file)
    toprocess.add_number_column(directory_to_csv_file)
    return toprocess

if __name__ == '__main__':
    parse_data_process(PandaDataFrame(data=DataToPandaReader().csv_file_reader(Directory(data_directory=os.path.join(DATA_REPO, 'input_data')))))
now, for example, to call the first function in the DataProcessor class, I would do the following:
DataProcessor().remove_punctuation(PandaDataFrame(data=DataToPandaReader().csv_file_reader(Directory(data_directory=os.path.join(DATA_REPO, 'input_data')))))
but my intention is to run all these functions in the DataProcessor class step by step, so the save_final_dataframe function would save the dataframe that has its punctuation removed and also has a number column.
update:
following the answer given, I made these changes, but I get an error saying the functions are not known.
def parse_data_process(directory_to_csv_file):
    toprocess = DataProcessor()
    toprocess.save_final_dataframe(directory_to_csv_file)
    toprocess.remove_punctuation(directory_to_csv_file)
    toprocess.add_number_column(directory_to_csv_file)
    return toprocess

if __name__ == '__main__':
    parse_data_process(PandaDataFrame(data=DataToPandaReader().csv_file_reader(Directory(data_directory=os.path.join(DATA_REPO, 'input_data')))))
Unless I've misunderstood your use-case, all you need to do is replace
return my_data_to_process
...in the remove_punctuation function with
return self.add_number_column(my_data_to_process)
...then replace
return my_data_to_process
...in the add_number_column function with
return self.save_final_dataframe(my_data_to_process)
Note the self. prefix: inside the class, the next step has to be reached through self, otherwise Python reports that the functions are not known, which is exactly the error from the update above.
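A minimal sketch of the chained version, assuming the classes from the question (the intermediate DataFrame is wrapped back into PandaDataFrame so every step keeps the same signature):

class DataProcessor(object):
    def remove_punctuation(self, my_: PandaDataFrame):
        df = my_.data
        for col in df.columns:
            if any(word in col for word in ['example', 'text', 'Answer']):
                # Strip punctuation from the matching column only.
                df[col] = df[col].str.replace(r'[^\w\s]', '', regex=True)
        return self.add_number_column(PandaDataFrame(data=df))

    def add_number_column(self, my_: PandaDataFrame):
        df = my_.data
        df['sentence_number'] = range(len(df))
        return self.save_final_dataframe(PandaDataFrame(data=df))

    def save_final_dataframe(self, my_: PandaDataFrame):
        return my_.data.to_csv('final_data.csv')

With this chaining, parse_data_process only needs the single entry call toprocess.remove_punctuation(directory_to_csv_file); the other two steps run automatically.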

Selecting a Design Pattern in Python

The code below I modified from multiple inheritance to the Facade pattern; however, it still breaks the Facade concept, as my subsystems (the ES classes) share state amongst themselves.
All the ES classes work on a structured data stream and enrich the data together; they all run asynchronously and their results are gathered into a list (using asyncio.gather); a minimal sketch of this flow follows the list below. I wanted to know which design pattern suits this use case, where:
I can keep adding ES classes as my requirements grow.
Each ES can share data with the others, like a dictionary.
When I add new functionality, I can follow the Single Responsibility Principle.
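For illustration only, here is a minimal sketch of that flow; ES3A, ES3B, and the enrich method are toy stand-ins for the real event stations, not the actual code:

import asyncio

class ES3A:
    async def enrich(self, context: dict) -> None:
        # Each station reads the shared dict and writes its own key.
        context['es3a'] = 'enriched(%s)' % context['raw']

class ES3B:
    async def enrich(self, context: dict) -> None:
        context['es3b'] = len(context['raw'])

async def run_stations(raw, stations):
    context = {'raw': raw}  # the dictionary-like state shared by all stations
    await asyncio.gather(*(station.enrich(context) for station in stations))
    return context

print(asyncio.run(run_stations('row-1', [ES3A(), ES3B()])))

New stations can be appended to the list without touching the existing ones, which keeps each class to a single responsibility.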
Multiple Inheritance:
import os
import asyncio
import psycopg2
import websockets
from datetime import datetime
from websockets.extensions import permessage_deflate
from structure import Structure
import sys
sys.path.append('..')
from event_stations.es3a import ES3A
from event_stations.es3b import ES3B
from event_stations.es3c import ES3C
from event_stations.es3d import ES3D
from event_stations.es1a import ES1A
from event_stations.es1b import ES1B
from event_stations.es2a import ES2A
from event_stations.es2b import ES2B

class FR_Server(ES1A, ES2A, ES2B, ES3A, ES3B, ES3C, ES3D, Structure):
    unique_id = "100"
    event_format_list = []
    fr_config_table = 'detail_event.app_fer_config'
    psql_db = None

    def __init__(self):
        print("Receiver Called INIT")
        self.psql_db = self.connect_to_psql()
        self._get_metadata()
        super(FR_Server, self).__init__()
        # self.start_receiver(self.mapped_dict)

    def connect_to_psql(self):
        db = psycopg2.connect("dbname=trimble user=postgres password=admin")
        return db

    def _get_parent_classname(self):
        _parent_classes = []
        _cls_obj = eval("FR_Server")
        for parent in _cls_obj.__bases__:
            _parent_classes.append(parent.__name__)
        return _parent_classes

    def _get_metadata(self):
        _parents = self._get_parent_classname()
        print(_parents, "pppp")
        cur = self.psql_db.cursor()
        cur.execute(f"Select * from {self.fr_config_table}")
        results = cur.fetchall()
        for result in results:
            event_station_id = result[0]
            if event_station_id in _parents:
                event_station_classname = eval(event_station_id)
                setattr(event_station_classname, "cache_table_name", result[1])
                setattr(event_station_classname, "ignite_port", result[2])
                setattr(event_station_classname, "ignite_host", result[3])

    def get_port(self):
        return os.getenv('WS_PORT', '10011')

    def get_host(self):
        return os.getenv('WS_HOST', 'localhost')

    async def start(self):
        return await websockets.serve(self.handler, self.get_host(), self.get_port(),
                                      ping_interval=None, max_size=None,
                                      max_queue=None, close_timeout=None,
                                      extensions=[
                                          permessage_deflate.ServerPerMessageDeflateFactory(
                                              server_max_window_bits=11,
                                              client_max_window_bits=11,
                                              compress_settings={'memLevel': 4},
                                          ),
                                      ])

    def generate_event_id(self, index):
        return "".join(['%02d%02d%d%d%d%d%d' % (datetime.now().day, datetime.now().month, datetime.now().year,
                                                datetime.now().hour, datetime.now().minute, datetime.now().second,
                                                datetime.now().microsecond), self.unique_id, index])

    async def handler(self, websocket, path):
        async with websockets.connect('ws://localhost:10015', ping_interval=None, max_size=None,
                                      max_queue=None, close_timeout=None,
                                      extensions=[permessage_deflate.ClientPerMessageDeflateFactory(
                                          server_max_window_bits=11,
                                          client_max_window_bits=11,
                                          compress_settings={'memLevel': 4},
                                      ),
                                      ]) as websocket_rb:
            async for row in websocket:
                lst_row = row.decode().split(",")
                uid = self.generate_event_id(lst_row[0])
                lst_row = [uid] + lst_row
                results = await asyncio.gather(self.enrich_column_es3a_dict(lst_row[1]),
                                               self.enrich_column_es3b_dict(lst_row[1]),
                                               self.enrich_column_es3c_dict(lst_row[1]),
                                               self.enrich_column_es3d_dict(lst_row[1]))
                await websocket_rb.send(str(lst_row + results).encode())

    def start_receiver(self, mapped_list):
        self.event_format_list = mapped_list
        asyncio.get_event_loop().run_until_complete(self.start())
        asyncio.get_event_loop().run_forever()
Facade pattern:
from __future__ import annotations
from event_stations.es1a import ES1A
from event_stations.es2a import ES2A
from event_stations.es2b import ES2B
import psycopg2

class Foundation_Facade(object):
    psql_db = None
    client = None

    def __init__(self, es1a: ES1A, es2a: ES2A) -> None:
        self._es1a = es1a or ES1A()
        self._es2a = es2a or ES2A()

    def operation(self):
        print("Called")
        results = []
        self.psql_db = self._es1a._connect_psql()
        self._es1a._get_metadata(self.psql_db.cursor())
        self.client = self._es1a.connect_ignite_client(self._es1a.ignite_host, self._es1a.ignite_port)
        self._es2a._get_metadata(self.psql_db.cursor())
        self._es2a.put_data(self.client)
        self._es2b._get_metadata(self.psql_db.cursor())
        self._es2b.put_data(self.client)
        print(self._es2b.static_df.head())
        # results.append(self._es1a._get_metadata())
        return results

if __name__ == '__main__':
    es1a = ES1A()
    es2a = ES2A()
    es2b = ES2B()
    facade = Foundation_Facade(es1a, es2a)
    from fr_server_1 import Server
    Server(facade)

Python mock func of an obj

Hello, my name is UserName and I can't mock =(
I have the following code:
app/worker/dataObj.py
import requests
import json

class DataObj:
    def __init__(self, data: dict):
        if 'assignee' in data:
            self.users = self.get_users()
            print(self.users)

    def get_users(self):
        users = requests.get(url=URL, headers=HEADERS)
        return json.loads(users.text)
app/test/test_dataObj.py
import unittest
from app.worker.dataObj import DataObj
from app.test.test_data import test_data
from unittest.mock import patch

class TestDataObj(unittest.TestCase):
    def setUp(self):
        self.data = test_data.data_for_dataobj
        self.dataobj = DataObj(self.data)

    @patch.object(DataObj, 'get_users')
    def test_dataobj(self, gu):
        gu.return_value = {'user1': 111, 'User2': 222}
        print(gu)
        self.assertEqual(self.dataobj.tags, ["11", "22", "401", "88888"])
I also tried to patch like this:
@patch("app.worker.dataObj.requests.get")
@patch("app.worker.dataObj.DataObj.get_users")
but it still doesn't mock.
I've read different related questions here, and it seems to be a patching-path issue, but I can't figure it out.
OK. I stopped trying to test the implementation and instead started to test the interface.
@patch("app.worker.dataObj.requests.get")
def test_dataobj_assignee(self, mock_req):
    mock_req.return_value.text = json.dumps(test_data.get_users_return)
    self.dataobj = DataObj(test_data.assignee)
    self.assertEqual(self.dataobj.assignee, 1130000021382371)
Hopefully it will be helpful for somebody.
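For reference, a minimal self-contained sketch of this interface-level approach, assuming the DataObj shown above (the {'assignee': 1} input and the fake user payload are made-up stand-ins for the real test_data module):

import json
import unittest
from unittest.mock import patch

from app.worker.dataObj import DataObj

class TestDataObjAssignee(unittest.TestCase):
    # Patch requests.get where DataObj looks it up, i.e. in the module
    # that imported it, not in the requests package itself.
    @patch("app.worker.dataObj.requests.get")
    def test_users_are_loaded_from_mocked_response(self, mock_get):
        mock_get.return_value.text = json.dumps({'user1': 111, 'User2': 222})
        obj = DataObj({'assignee': 1})  # made-up minimal input containing 'assignee'
        self.assertEqual(obj.users, {'user1': 111, 'User2': 222})

if __name__ == "__main__":
    unittest.main()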

Reset index name in elasticsearch dsl

I'm trying to create an ETL that extracts from Mongo, processes the data, and loads it into Elastic. I will do a daily load, so I thought of naming my index with the current date. This will help me with a later processing step I need to do on this first index.
I used the elasticsearch-dsl persistence guide: https://elasticsearch-dsl.readthedocs.io/en/latest/persistence.html
The problem I have comes from my limited experience working with classes. I don't know how to reset the index name from the class.
Here is my code for the class (custom_indices.py):
from elasticsearch_dsl import Document, Date, Integer, Keyword, Text
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Search
import datetime

class News(Document):
    title = Text(analyzer='standard', fields={'raw': Keyword()})
    manual_tagging = Keyword()

    class Index:
        name = 'processed_news_' + datetime.datetime.now().strftime("%Y%m%d")

    def save(self, **kwargs):
        return super(News, self).save(**kwargs)

    def is_published(self):
        return datetime.datetime.now() >= self.processed
And this is the part of the code where I create the instance of that class:
from custom_indices import News
import elasticsearch
import elasticsearch_dsl
from elasticsearch_dsl.connections import connections
import pandas as pd
import datetime

connections.create_connection(hosts=['localhost'])
News.init()
for index, doc in df.iterrows():
    new_insert = News(meta={'id': doc.url_hashed},
                      title=doc.title,
                      manual_tagging=doc.customTags,
                      )
    new_insert.save()
Every time I call the News class I would expect it to get a new name. However, the name doesn't change even if I load the class again (from custom_indices import News). I know this is only a problem when testing, but I'd like to know how to force that "reset". Actually, I originally wanted to change the name outside the class with this line right before the loop:
News.Index.name = "NEW_NAME"
However, that didn't work; I was still seeing the name defined in the class.
Could anyone please assist?
Many thanks!
PS: this must just be an object-oriented programming issue. Apologies for my ignorance on the subject.
Maybe you could take advantage of the fact that Document.init() accepts an index keyword argument. If you want the index name to be set automatically, you could implement init() in the News class and call super().init(...) in your implementation.
A simplified example (Python 3.x):
from elasticsearch_dsl import Document
from elasticsearch_dsl.connections import connections
import datetime

class News(Document):
    @classmethod
    def init(cls, index=None, using=None):
        index_name = index or 'processed_news_' + datetime.datetime.now().strftime("%Y%m%d")
        return super().init(index=index_name, using=using)
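A quick usage sketch for the class above: with no argument, init() picks the date-stamped name, while an explicit index overrides it (the 'processed_news_backfill' name is a hypothetical example):

News.init()                                 # creates processed_news_<YYYYMMDD>
News.init(index='processed_news_backfill')  # creates an explicitly named index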
You can override the index when you call save(); note that index must be passed as a keyword argument, since the first positional parameter of save() is using:
new_insert.save(index='processed_news_' + datetime.datetime.now().strftime("%Y%m%d"))
A full example follows.
# coding: utf-8
import datetime
from elasticsearch_dsl import Keyword, Text, \
    Index, Document, Date
from elasticsearch_dsl.connections import connections

HOST = "localhost:9200"
index_names = [
    "foo-log-",
    "bar-log-",
]
default_settings = {"number_of_shards": 4, "number_of_replicas": 1}
index_settings = {
    "foo-log-": {
        "number_of_shards": 40,
        "number_of_replicas": 1
    }
}

class LogDoc(Document):
    level = Keyword(ignore_above=256)
    date = Date(format="yyyy-MM-dd'T'HH:mm:ss.SSS")
    hostname = Text(fields={'fields': Keyword(ignore_above=256)})
    message = Text()
    createTime = Date(format="yyyy-MM-dd'T'HH:mm:ss.SSS")

def auto_create_index():
    '''Automatically create the ES indices.'''
    connections.create_connection(hosts=[HOST])
    for day in range(3):
        dt = datetime.datetime.now() + datetime.timedelta(days=day)
        for index in index_names:
            name = index + dt.strftime("%Y-%m-%d")
            settings = index_settings.get(index, default_settings)
            idx = Index(name=name)
            idx.document(LogDoc)
            idx.settings(**settings)
            try:
                idx.create()
            except Exception as e:
                print(e)
                continue
            print("create index %s" % name)

if __name__ == '__main__':
    auto_create_index()
