I'm trying to get the variables of a specified host, including its host and group variables, but I can't get the variables defined in the group vars file.
For example:
from ansible.parsing.dataloader import DataLoader
from ansible.vars.manager import VariableManager
from ansible.inventory.manager import InventoryManager
from ansible.vars.manager import Host
class AnsibleConnector:
    """Thin wrapper around Ansible's inventory and variable managers.

    Loads ``inventory.yml`` (relative to the current working directory) and
    exposes helpers for reading host and group variables.
    """

    def __init__(self):
        self.ansible_inventory_file_path = "inventory.yml"
        self.loader = DataLoader()
        self.inventory = InventoryManager(self.loader, sources=self.ansible_inventory_file_path)
        self.variable_manager = VariableManager(loader=self.loader, inventory=self.inventory)

    def get_host_vars(self, host, include_group_vars=False, groups=None):
        """Return the merged variables for ``host``.

        Args:
            host: A host name, or a host object exposing ``.name``.
            include_group_vars: When true, also merge the variables of every
                group named in ``groups`` (lowest priority).
            groups: Names of inventory groups to merge; ignored unless
                ``include_group_vars`` is true.

        Returns:
            A dict built from, in increasing priority: the requested group
            vars, the variable manager's view of the host, and the host's own
            inventory vars.

        Raises:
            ValueError: If ``host`` is a name that is not in the inventory.
            KeyError: If a name in ``groups`` is not an inventory group.
        """
        if isinstance(host, str):
            # Resolve the name through the inventory instead of building a
            # fresh Host object: a new Host has no group membership, so the
            # variable manager has no groups to look up variables for.
            inventory_host = self.inventory.get_host(host)
            if inventory_host is None:
                raise ValueError("Host %r was not found in the inventory" % host)
            host = inventory_host

        group_vars = {}
        if include_group_vars:
            for group in groups or []:
                group_vars.update(self.get_group_vars(group))

        # combine 3 dict in 1 with priority of host_vars (as ansible does)
        host_vars = {
            **group_vars,
            **self.variable_manager.get_vars(host=host, include_hostvars=True),
            **self.inventory.get_host(host.name).vars,
        }
        return host_vars

    def get_group_vars(self, group):
        """Return the variables attached to the inventory group named ``group``."""
        return self.inventory.groups[group].get_vars()
if __name__ == "__main__":
ansible_connector = AnsibleConnector()
ansible_connector.get_host_vars('hpserver01', True, ['hp-servers'])
The code above gives me the host vars of hpserver01 and the group vars that are defined inside my inventory.ini file, but I can't get the vars that are in groups_vars/hp-servers.yml.
How can I get those vars?
I've searched in many threads, but I couldn't find any reference for how to get the group vars file.
I tried using get_group_vars from ansible.helpers but it didn't help...
Let's assume I have the following file structure:
foo = []
bar = []
abc = "def"
import data
# do something here #
# a = ...
print a
# ['foo', 'bar', 'abc']
I need to get all the variables defined in data.py file. How can I achieve that? I could use dir(), but it returns all the attributes of the module including __name__ and so on.
print [item for item in dir(adfix) if not item.startswith("__")]
Is usually the recipe for doing this, but it begs the question.
# coding: utf-8
__author__ = 'spouk'
def get_book_variable_module_name(module_name):
    """Return the public attributes of an already-imported module as a dict.

    The module is looked up by name in this file's global namespace, so it
    must have been imported here under ``module_name``.

    Args:
        module_name: Name under which the module is bound in this file.

    Returns:
        A ``{name: value}`` dict of every attribute whose name does not start
        with an underscore, or an empty dict when no such global exists.
    """
    module = globals().get(module_name)
    if not module:
        return {}
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    # A single '_' prefix test also covers dunder names.
    return {
        key: value
        for key, value in module.__dict__.items()
        if not key.startswith('_')
    }
import config
# Collect the public names of the already-imported ``config`` module.
book = get_book_variable_module_name('config')
# items() and print(...) with one argument behave the same on Python 2 and 3.
for key, value in book.items():
    # Convert to str first: on Python 3 a width spec cannot be applied
    # directly to values such as modules or frozensets.
    print("{:<30}{:<100}".format(key, str(value)))
example config
# coding: utf-8
__author__ = 'spouk'
import os
_basedir = os.path.abspath(os.path.dirname(__file__))
# database section MYSQL section
DBHOST = 'localhost'
DBNAME = 'simple_domain'
DBPORT = 3306
DBUSER = 'root'
DBPASS = 'root'
# global section
DEBUG = True
HOSTNAME = 'simpledomain.com'
HOST = ''
PORT = 3000
ADMINS = frozenset(['admin#localhost'])
SECRET_KEY = 'dfg45DFcx4rty'
CSRF_SESSION_KEY = "simplekey"
result function
/usr/local/bin/python2 /home/spouk/develop/python/2015/utils_2015/parse_config_py.py
os <module 'os' from '/usr/local/lib/python2.7/os.pyc'>
DBHOST localhost
HOSTNAME simpledomain.com
PORT 3000
ADMINS frozenset(['admin#localhost'])
SECRET_KEY dfg45DFcx4rty
DBNAME simple_domain
Process finished with exit code 0
Enjoy, dude. :)
I have to make a dictionary of these variables. I used this code.
print({item:getattr(my_module, item) for item in dir(my_module) if not item.startswith("__") and not item.endswith("__")})
This is the version I wrote for python 3.7 (it excludes the internal dunder methods via the condition in the comprehension)
print([v for v in dir(data) if v[:2] != "__"])
A longer but complete working example is below:
"""an example of a config file whose variables may be accessed externally"""
# Module variables
server_address = ""
server_port = 8010
server_to_client_port = 8020
client_to_server_port = 8030
client_buffer_length = 4096
server_buffer_length = 2048
def printVariables(variable_names):
    """Renders variables and their values on the terminal.

    Each name is looked up in this module's globals and printed as
    ``name: value``, with names left-aligned and values right-aligned to the
    widest entry.

    Args:
        variable_names: Iterable of names of module-level variables.

    Raises:
        KeyError: If a name is not defined at module level.
    """
    names = list(variable_names)
    if not names:
        # max() raises ValueError on an empty sequence; there is nothing to print.
        return
    module_globals = globals()
    # Render every value with str() up front so the computed width and the
    # printed text always agree, and so values without format-spec support
    # (None, lists, ...) do not raise.
    rendered = [(name, str(module_globals[name])) for name in names]
    max_name_len = max(len(name) for name, _ in rendered)
    max_val_len = max(len(text) for _, text in rendered)
    for name, text in rendered:
        print(f' {name:<{max_name_len}}: {text:>{max_val_len}}')
if __name__ == "__main__":
ks = [k for k in dir() if (k[:2] != "__" and not callable(globals()[k]))]
The above code outputs:
an example of a config file whose variables may be accessed externally
client_buffer_length : 4096
client_to_server_port: 8030
server_address :
server_buffer_length : 2048
server_port : 8010
server_to_client_port: 8020
I offer my solution. It is convenient in that it allows you to display variables from any imported module.
If you do not specify the name of the module, then the list of variables of the current module is displayed.
import sys
def print_settings(module_name=None):
    """Print the public str/int/float variables of a module, one per line.

    Args:
        module_name: A module object, or the name of an already-imported
            module. When omitted, the module defining this function is used.

    Raises:
        KeyError: If ``module_name`` is a string that is not in ``sys.modules``.
    """
    if not module_name:
        module = sys.modules[__name__]
    elif isinstance(module_name, str):
        # Accept a name as well as a module object, as the parameter name suggests.
        module = sys.modules[module_name]
    else:
        module = module_name
    variables = [
        (key, value)
        for (key, value) in vars(module).items()
        # Exact type match on purpose: isinstance() would also admit bool.
        if type(value) in (str, int, float) and not key.startswith("_")
    ]
    for (key, value) in variables:
        print(f"{key: <20} {value}")
If you need the variable and the value assigned to it then
import data
for name ,values in vars(data).items():
print(name, values)
You can choose to store name (all the variable names in the script) or the value attached to it .
Here's a roundabout way, if you prefer to be more explicit:
a = [
foo := [],
bar := [],
abc := "def",
import data
for vars in dir():
if vars.startswith("var"):
print vars
I am looking to fetch and publish data from spark streaming onto cloudant. My code is as follows -
from CloudantPublisher import CloudantPublisher
from CloudantFetcher import CloudantFetcher
import pyspark
from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from kafka import KafkaConsumer, KafkaProducer
import json
class SampleFramework():
def __init__(self):
def messageHandler(m):
return json.loads(m.message)
def processData(rdd):
if (rdd.isEmpty()):
SampleFramework.logger.info("RDD is empty")
# Expand
expanded_rdd = rdd.mapPartitions(CloudantFetcher.fetch)
def run(self, ssc):
self.ssc = ssc
directKafkaStream = KafkaUtils.createDirectStream(self.ssc, SUBSCRIBE_QUEUE], \
{"metadata.broker.list": METADATA, \
"bootstrap.servers": BOOTSTRAP_SERVERS}, \
messageHandler= SampleFramework.messageHandler)
Other supporting classes -
from CloudantConnector import CloudantConnector
class CloudantFetcher:
config = Config.createConfig()
cloudantConnector = CloudantConnector(config)
def fetch(data):
final_data = []
for row in data:
id = row["id"]
if(not CloudantFetcher.cloudantConnector.isReady()):
data_json = CloudantFetcher.cloudantConnector.getOne({"id": id})
row["data"] = data_json
return final_data
class CloudantPublisher:
config = Config.createConfig()
cloudantConnector = CloudantConnector(config)
def publish(data):
from cloudant.client import Cloudant
from cloudant.result import Result
from cloudant.result import QueryResult
from cloudant.document import Document
from cloudant.query import Query
from cloudant.database import CloudantDatabase
import json
class CloudantConnector:
def __init__(self, config, db_name):
self.config = config
self.client = Cloudant(self.config["cloudant"]["credentials"]["username"], self.config["cloudant"]["credentials"]["password"], url=self.config["cloudant"]["host"]["full"])
self.initialized = False
self.db_name = self.config["cloudant"]["host"]["db_name"]
def open(self):
self.logger.info("Connection to Cloudant established.")
self.initialized = True
raise Exception("Could not connect to Cloudant! Please verify credentials.")
self.database = CloudantDatabase(self.client,self.db_name)
if self.database.exists():
def isReady(self):
return self.initialized
def close(self):
def getOne(self, query):
new_filter = query
query = Query(self.database, selector = query, limit=1)
results_string = json.dumps(query.result[0][0])
results_json = json.loads(results_string)
return results_json
def postAll(self, docs):
documents = []
quantum = self.config["cloudant"]["constants"]["bulk_quantum"]
count = 0
for doc in docs:
document = Document(self.database)
document["id"] = doc["id"]
document["data"] = doc["data"]
count = count + 1
documents = []
self.logger.debug("Uploaded document to the Cloudant database.")
My implementation works, but it is slow compared to what I would expect if, instead of initializing the Cloudant connection in each partition, I maintained a static pool of connections that could be passed to, or fetched by, each partition.
My Questions are as follows:
Do I need to create a connection pool with cloudant 2.0 API in python? (It seems that it already exists within the API). If yes, then how should I go about it? The closest I have seen an implementation is this - link, but it's on an outdated cloudant api and does not work with the new one.
If the answer to the above is 'Yes', How can I make this accessible to the workers? I see references to creating serializable, lazily instantiated connection-client objects here. This would mean that I would make a lazily instantiated cloudant connection object in the SampleFramework. How can I do this in Python? Just like given in the spark documentation.
connection = ConnectionPool.getConnection()
for record in iter:
If the above is not possible, how do I speed up my operations? The only alternative I can think of is maintaining a single connection on the driver program, collecting the data from all workers and then fetching/uploading it there. This would decrease the number of times I need to connect to Cloudant, but would take away the distributed fetching/publishing architecture.
I'm writing a program which periodically dumps old data from a RethinkDB database into a file and removes it from the database. Currently, the data is dumped into a single file which grows without limit. I'd like to change this so that the maximum file size is, say, 250 MB, and the program starts to write to a new output file just before this size is exceeded.
It seems like Python's RotatingFileHandler class for loggers does approximately what I want; however, I'm not sure whether logging can be applied to any JSON-dumpable object or just to strings.
Another possible approach would be to use (a variant of) Mike Pennington's
RotatingFile class (see python: outfile to another text file if exceed certain file size).
Which of these approaches is likely to be the most fruitful?
For reference, my current program is as follows:
import os
import sys
import json
import rethinkdb as r
import pytz
from datetime import datetime, timedelta
import schedule
import time
import functools
from iclib import RethinkDB
import msgpack
''' The purpose of the Controller is to periodically archive data from the "sensor_data" table so that it does not grow without limit.'''
class Controller(RethinkDB):
def __init__(self, db_address=(os.environ['DB_ADDR'], int(os.environ['DB_PORT'])), db_name=os.environ['DB_NAME']):
super(Controller, self).__init__(db_address=db_address, db_name=db_name) # Initialize the IperCronComponent with the default logger name (in this case, "Controller")
self.db_table = RethinkDB.SENSOR_DATA_TABLE # The table name is "sensor_data" and is stored as a class variable in RethinkDBMixIn
def generate_archiving_query(self, retention_period=timedelta(days=3)):
expiry_time = r.now() - retention_period.total_seconds() # Timestamp before which data is to be archived
if "timestamp" in r.table(self.db_table).index_list().run(self.db): # If "timestamp" is a secondary index
beginning_of_time = r.time(1400, 1, 1, 'Z') # The minimum time of a ReQL time object (i.e., the year 1400 in the UTC timezone)
data_to_archive = r.table(self.db_table).between(beginning_of_time, expiry_time, index="timestamp") # Generate query using "between" (faster)
data_to_archive = r.table(self.db_table).filter(r.row['timestamp'] < expiry_time) # Generate the same query using "filter" (slower, but does not require "timestamp" to be a secondary index)
return data_to_archive
def archiving_job(self, data_to_archive=None, output_file="archived_sensor_data.json"):
if data_to_archive is None:
data_to_archive = self.generate_archiving_query() # By default, the call the "generate_archiving_query" function to generate the query
old_data = data_to_archive.run(self.db, time_format="raw") # Without time_format="raw" the output does not dump to JSON
with open(output_file, 'a') as f:
ids_to_delete = []
for item in old_data:
print item
# msgpack.dump(item, f)
json.dump(item, f)
f.write('\n') # Separate each document by a new line
r.table(self.db_table).get_all(r.args(ids_to_delete)).delete().run(self.db) # Delete based on ID. It is preferred to delete the entire batch in a single operation rather than to delete them one by one in the for loop.
def test_job_1():
db_name = "ipercron"
table_name = "sensor_data"
port_offset = 1 # To avoid interference of this testing program with the main program, all ports are initialized at an offset of 1 from the default ports using "rethinkdb --port_offset 1" at the command line.
conn = r.connect("localhost", 28015 + port_offset)
import rethinkdb_add_data
controller = Controller(db_address=("localhost", 28015+port_offset))
archiving_job = functools.partial(controller.archiving_job, data_to_archive=controller.generate_archiving_query())
return archiving_job
if __name__ == "__main__":
archiving_job = test_job_1()
while True:
It is not completely 'runnable' from the part shown, but the key point is that I would like to replace the line
json.dump(item, f)
with a similar line in which f is a rotating, and not fixed, file object.
Following Stanislav Ivanov, I used json.dumps to convert each RethinkDB document to a string and wrote this to a RotatingFileHandler:
import os
import sys
import json
import rethinkdb as r
import pytz
from datetime import datetime, timedelta
import schedule
import time
import functools
from iclib import RethinkDB
import msgpack
import logging
from logging.handlers import RotatingFileHandler
from random_data_generator import RandomDataGenerator
''' The purpose of the Controller is to periodically archive data from the "sensor_data" table so that it does not grow without limit.'''
os.environ['DB_ADDR'] = 'localhost'
os.environ['DB_PORT'] = '28015'
os.environ['DB_NAME'] = 'ipercron'
class Controller(RethinkDB):
def __init__(self, db_address=None, db_name=None):
if db_address is None:
db_address = (os.environ['DB_ADDR'], int(os.environ['DB_PORT'])) # The default host ("rethinkdb") and port (28015) are stored as environment variables
if db_name is None:
db_name = os.environ['DB_NAME'] # The default database is "ipercron" and is stored as an environment variable
super(Controller, self).__init__(db_address=db_address, db_name=db_name) # Initialize the instance of the RethinkDB class. IperCronComponent will be initialized with its default logger name (in this case, "Controller")
self.db_name = db_name
self.db_table = RethinkDB.SENSOR_DATA_TABLE # The table name is "sensor_data" and is stored as a class variable of RethinkDBMixIn
self.table = r.db(self.db_name).table(self.db_table)
self.archiving_logger = logging.getLogger("archiving_logger")
self.archiving_handler = RotatingFileHandler("archived_sensor_data.log", maxBytes=2000, backupCount=10)
def generate_archiving_query(self, retention_period=timedelta(days=3)):
expiry_time = r.now() - retention_period.total_seconds() # Timestamp before which data is to be archived
if "timestamp" in self.table.index_list().run(self.db):
beginning_of_time = r.time(1400, 1, 1, 'Z') # The minimum time of a ReQL time object (namely, the year 1400 in UTC)
data_to_archive = self.table.between(beginning_of_time, expiry_time, index="timestamp") # Generate query using "between" (faster, requires "timestamp" to be a secondary index)
data_to_archive = self.table.filter(r.row['timestamp'] < expiry_time) # Generate query using "filter" (slower, but does not require "timestamp" to be a secondary index)
return data_to_archive
def archiving_job(self, data_to_archive=None):
if data_to_archive is None:
data_to_archive = self.generate_archiving_query() # By default, the call the "generate_archiving_query" function to generate the query
old_data = data_to_archive.run(self.db, time_format="raw") # Without time_format="raw" the output does not dump to JSON or msgpack
ids_to_delete = []
for item in old_data:
print item
self.table.get_all(r.args(ids_to_delete)).delete().run(self.db) # Delete based on ID. It is preferred to delete the entire batch in a single operation rather than to delete them one by one in the for-loop.
def dump(self, item, mode='json'):
if mode == 'json':
dump_string = json.dumps(item)
elif mode == 'msgpack':
dump_string = msgpack.packb(item)
def populate_database(db_name, table_name, conn):
if db_name not in r.db_list().run(conn):
r.db_create(db_name).run(conn) # Create the database if it does not yet exist
if table_name not in r.db(db_name).table_list().run(conn):
r.db(db_name).table_create(table_name).run(conn) # Create the table if it does not yet exist
r.db(db_name).table(table_name).delete().run(conn) # Empty the table to start with a clean slate
# Generate random data with timestamps uniformly distributed over the past 6 days
random_data_time_interval = timedelta(days=6)
start_random_data = datetime.utcnow().replace(tzinfo=pytz.utc) - random_data_time_interval
random_generator = RandomDataGenerator(seed=0)
packets = random_generator.packets(N=100, start=start_random_data)
# print packets
print "Adding data to the database..."
if __name__ == "__main__":
db_name = "ipercron"
table_name = "sensor_data"
port_offset = 1 # To avoid interference of this testing program with the main program, all ports are initialized at an offset of 1 from the default ports using "rethinkdb --port_offset 1" at the command line.
host = "localhost"
port = 28015 + port_offset
conn = r.connect(host, port) # RethinkDB connection object
populate_database(db_name, table_name, conn)
# import rethinkdb_add_data
controller = Controller(db_address=(host, port))
archiving_job = functools.partial(controller.archiving_job, data_to_archive=controller.generate_archiving_query()) # This ensures that the query is only generated once. (This is sufficient since r.now() is re-evaluated every time a connection is made).
while True:
In this context the RethinkDB class does little other than define the class variable SENSOR_DATA_TABLE and the RethinkDB connection, self.db = r.connect(self.address[0], self.address[1]). This is run together with a module for generating fake data, random_data_generator.py:
import random
import faker
from datetime import datetime, timedelta
import pytz
import rethinkdb as r
class RandomDataGenerator(object):
    """Generate fake names, timestamps and packets for populating a test table.

    Attributes that are not found on the instance are delegated to the
    wrapped ``random.Random`` object, so methods such as ``randrange`` can be
    called directly on the generator.
    """

    def __init__(self, seed=None):
        """Create a generator.

        Args:
            seed: Seed for the private random number generator. ``None``
                (the default) lets ``random.Random`` pick its own seed.
        """
        self._seed = seed
        # Pass the seed on; otherwise it is stored but never used and a
        # seeded generator is not reproducible.
        self._random = random.Random(seed)
        # NOTE(review): the Faker instance is not seeded here, so name() stays
        # non-deterministic even with a seed; confirm whether that is intended.
        self.fake = faker.Faker()

    def __getattr__(self, x):
        # __getattr__ only runs when normal lookup fails. If _random itself is
        # missing (e.g. an instance created without __init__), delegating
        # would recurse forever, so fail with a normal AttributeError.
        if x == "_random":
            raise AttributeError(x)
        return getattr(self._random, x)

    def name(self):
        """Return a fake person name produced by Faker."""
        return self.fake.name()

    def datetime(self, start=None, end=None):
        """Return a random datetime in the half-open interval starting at ``start``.

        Args:
            start: Lower bound. Defaults to 2000-01-01 UTC.
            end: Either an upper-bound ``datetime`` or a ``timedelta`` giving
                the length of the interval. Defaults to the current UTC time.

        Returns:
            ``start`` plus a random whole number of microseconds smaller than
            the interval length.

        Raises:
            TypeError: If ``end`` is neither a ``datetime`` nor a ``timedelta``.
            ValueError: If the interval is empty or negative.
        """
        if start is None:
            start = datetime(2000, 1, 1, tzinfo=pytz.utc)  # Jan 1st 2000
        if end is None:
            end = datetime.utcnow().replace(tzinfo=pytz.utc)
        if isinstance(end, datetime):
            dt = end - start
        elif isinstance(end, timedelta):
            dt = end
        else:
            # Explicit error instead of an assert, which is stripped under -O.
            raise TypeError("end must be a datetime or a timedelta, not %s" % type(end).__name__)
        span_microseconds = int(dt.total_seconds() * (10 ** 6))
        if span_microseconds <= 0:
            raise ValueError("end must be later than start")
        random_dt = timedelta(microseconds=self._random.randrange(span_microseconds))
        return start + random_dt

    def packets(self, N=1, start=None, end=None):
        """Return ``N`` dicts, each with a fake ``name`` and a random ``timestamp``."""
        return [{'name': self.name(), 'timestamp': self.datetime(start=start, end=end)} for _ in range(N)]
When I run controller it produces several rolled-over output logs, each at most 2 kB in size, as expected:
I'm trying to query global group variables set in Ansible. I seem to be getting an empty dictionary and I'm not sure what else I can do. My code looks like this:
def __init__(self, inventory_path=None):
self.loader = DataLoader()
self.variable_manager = VariableManager()
self.inventory = Inventory(loader=self.loader, variable_manager=self.variable_manager, host_list=inventory_path)
when I then try to get group vars as below:
inventory_asg_groups = filter(lambda g: 'asg' in g, self.inventory.groups)
for group in inventory_asg_groups:
I get an empty dictionary:
when I just do a:
I get this:
{'ansible_python_interpreter': '/usr/local/opt/python/bin/python2.7', 'ansible_connection': 'local'}
I know the inventory is being loaded, since I list all the groups in the inventory. How do I get the variables listed in group_vars/all via the python ansible api?
This actually works if you have group-specific variables defined (group_vars/<group_name>.yml). To read the variables of 'all' directly, you can use self.inventory.get_group_vars(self.inventory.get_group('all')).
Full example below:
# hosts
setting_something: "5"
setting_something: "6500"
from ansible.parsing.dataloader import DataLoader
from ansible.vars import VariableManager
from ansible.inventory import Inventory
class AnsibleLoader:
def __init__(self, inventory_path=None):
self.loader = DataLoader()
self.variable_manager = VariableManager()
self.inventory = Inventory(loader=self.loader, variable_manager=self.variable_manager, host_list=inventory_path)
inventory_asg_groups = filter(lambda g: 'asg' in g, self.inventory.groups)
for group in inventory_asg_groups:
print('vars for: %s' % group)
[u'asg_2', u'asg_1']
vars for: asg_2
vars for: asg_1
{u'setting_something': u'6500'}
{u'setting_something': u'5'}
So it prints values for asg_1, as it has a file in group_vars, but not for asg_2. The last line is accessing 'all'.
All done with ansible 2.0, not sure which version you use.
Let's assume I have the following file structure:
foo = []
bar = []
abc = "def"
import data
# do something here #
# a = ...
print a
# ['foo', 'bar', 'abc']
I need to get all the variables defined in data.py file. How can I achieve that? I could use dir(), but it returns all the attributes of the module including __name__ and so on.
print [item for item in dir(adfix) if not item.startswith("__")]
Is usually the recipe for doing this, but it begs the question.
# coding: utf-8
__author__ = 'spouk'
def get_book_variable_module_name(module_name):
    """Return the public attributes of an already-imported module as a dict.

    The module is looked up by name in this file's global namespace, so it
    must have been imported here under ``module_name``.

    Args:
        module_name: Name under which the module is bound in this file.

    Returns:
        A ``{name: value}`` dict of every attribute whose name does not start
        with an underscore, or an empty dict when no such global exists.
    """
    module = globals().get(module_name)
    if not module:
        return {}
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    # A single '_' prefix test also covers dunder names.
    return {
        key: value
        for key, value in module.__dict__.items()
        if not key.startswith('_')
    }
import config
# Collect the public names of the already-imported ``config`` module.
book = get_book_variable_module_name('config')
# items() and print(...) with one argument behave the same on Python 2 and 3.
for key, value in book.items():
    # Convert to str first: on Python 3 a width spec cannot be applied
    # directly to values such as modules or frozensets.
    print("{:<30}{:<100}".format(key, str(value)))
example config
# coding: utf-8
__author__ = 'spouk'
import os
_basedir = os.path.abspath(os.path.dirname(__file__))
# database section MYSQL section
DBHOST = 'localhost'
DBNAME = 'simple_domain'
DBPORT = 3306
DBUSER = 'root'
DBPASS = 'root'
# global section
DEBUG = True
HOSTNAME = 'simpledomain.com'
HOST = ''
PORT = 3000
ADMINS = frozenset(['admin#localhost'])
SECRET_KEY = 'dfg45DFcx4rty'
CSRF_SESSION_KEY = "simplekey"
result function
/usr/local/bin/python2 /home/spouk/develop/python/2015/utils_2015/parse_config_py.py
os <module 'os' from '/usr/local/lib/python2.7/os.pyc'>
DBHOST localhost
HOSTNAME simpledomain.com
PORT 3000
ADMINS frozenset(['admin#localhost'])
SECRET_KEY dfg45DFcx4rty
DBNAME simple_domain
Process finished with exit code 0
Enjoy, dude. :)
I have to make a dictionary of these variables. I used this code.
print({item:getattr(my_module, item) for item in dir(my_module) if not item.startswith("__") and not item.endswith("__")})
This is the version I wrote for python 3.7 (it excludes the internal dunder methods via the condition in the comprehension)
print([v for v in dir(data) if v[:2] != "__"])
A longer but complete working example is below:
"""an example of a config file whose variables may be accessed externally"""
# Module variables
server_address = ""
server_port = 8010
server_to_client_port = 8020
client_to_server_port = 8030
client_buffer_length = 4096
server_buffer_length = 2048
def printVariables(variable_names):
    """Renders variables and their values on the terminal.

    Each name is looked up in this module's globals and printed as
    ``name: value``, with names left-aligned and values right-aligned to the
    widest entry.

    Args:
        variable_names: Iterable of names of module-level variables.

    Raises:
        KeyError: If a name is not defined at module level.
    """
    names = list(variable_names)
    if not names:
        # max() raises ValueError on an empty sequence; there is nothing to print.
        return
    module_globals = globals()
    # Render every value with str() up front so the computed width and the
    # printed text always agree, and so values without format-spec support
    # (None, lists, ...) do not raise.
    rendered = [(name, str(module_globals[name])) for name in names]
    max_name_len = max(len(name) for name, _ in rendered)
    max_val_len = max(len(text) for _, text in rendered)
    for name, text in rendered:
        print(f' {name:<{max_name_len}}: {text:>{max_val_len}}')
if __name__ == "__main__":
ks = [k for k in dir() if (k[:2] != "__" and not callable(globals()[k]))]
The above code outputs:
an example of a config file whose variables may be accessed externally
client_buffer_length : 4096
client_to_server_port: 8030
server_address :
server_buffer_length : 2048
server_port : 8010
server_to_client_port: 8020
I offer my solution. It is convenient in that it allows you to display variables from any imported module.
If you do not specify the name of the module, then the list of variables of the current module is displayed.
import sys
def print_settings(module_name=None):
    """Print the public str/int/float variables of a module, one per line.

    Args:
        module_name: A module object, or the name of an already-imported
            module. When omitted, the module defining this function is used.

    Raises:
        KeyError: If ``module_name`` is a string that is not in ``sys.modules``.
    """
    if not module_name:
        module = sys.modules[__name__]
    elif isinstance(module_name, str):
        # Accept a name as well as a module object, as the parameter name suggests.
        module = sys.modules[module_name]
    else:
        module = module_name
    variables = [
        (key, value)
        for (key, value) in vars(module).items()
        # Exact type match on purpose: isinstance() would also admit bool.
        if type(value) in (str, int, float) and not key.startswith("_")
    ]
    for (key, value) in variables:
        print(f"{key: <20} {value}")
If you need the variable and the value assigned to it then
import data
for name ,values in vars(data).items():
print(name, values)
You can choose to store name (all the variable names in the script) or the value attached to it .
Here's a roundabout way, if you prefer to be more explicit:
a = [
foo := [],
bar := [],
abc := "def",
import data
for vars in dir():
if vars.startswith("var"):
print vars