How to set variables in an Ansible inventory from Python?

In my playbook I'm using the variable {{ excluded_service }}. I want to run the playbook from Python and provide this variable, and I can't use an external inventory script to provide it.
I am creating the inventory like this:
hosts = ["127.0.0.1"]
inventory = ansible.inventory.Inventory(hosts)
but I don't understand where I can add the value of the variable.
My code, which works with an external inventory script:
import sys
import os
import stat
import json
import ansible.playbook
import ansible.constants as C
import ansible.utils.template
from ansible import errors
from ansible import callbacks
from ansible import utils
from ansible.color import ANSIBLE_COLOR, stringc
from ansible.callbacks import display
playbook="/opt/RDE/3p/ansible/loop/testloop.yml"
inventory="/opt/RDE/3p/ansible/loop/lxc.py"
connection="local"
timeout=10
subset=None
options = {
    'subset': None, 'ask_pass': False, 'sudo': False, 'private_key_file': None,
    'syntax': None, 'skip_tags': None, 'diff': False, 'check': False,
    'remote_user': 'root', 'listtasks': None,
    'inventory': '/opt/RDE/3p/ansible/loop/lxc.py', 'forks': 5, 'listhosts': None,
    'start_at': None, 'tags': 'all', 'step': None, 'sudo_user': None,
    'ask_sudo_pass': False, 'extra_vars': [], 'connection': 'smart',
    'timeout': 10, 'module_path': None
}
sshpass = None
sudopass = None
extra_vars = {}
def colorize(lead, num, color):
    """ Print 'lead' = 'num' in 'color' """
    if num != 0 and ANSIBLE_COLOR and color is not None:
        return "%s%s%-15s" % (stringc(lead, color), stringc("=", color), stringc(str(num), color))
    else:
        return "%s=%-4s" % (lead, str(num))

def hostcolor(host, stats, color=True):
    if ANSIBLE_COLOR and color:
        if stats['failures'] != 0 or stats['unreachable'] != 0:
            return "%-37s" % stringc(host, 'red')
        elif stats['changed'] != 0:
            return "%-37s" % stringc(host, 'yellow')
        else:
            return "%-37s" % stringc(host, 'green')
    return "%-26s" % host
inventory = ansible.inventory.Inventory(options['inventory'])
hosts = ["127.0.0.1"]
#inventory=ansible.inventory.Inventory(hosts)
inventory.subset(options['subset'])
if len(inventory.list_hosts()) == 0:
    raise errors.AnsibleError("provided hosts list is empty")
inventory.set_playbook_basedir(os.path.dirname(playbook))
stats = callbacks.AggregateStats()
playbook_cb = callbacks.PlaybookCallbacks(verbose=utils.VERBOSITY)
if options['step']:
    playbook_cb.step = options['step']
if options['start_at']:
    playbook_cb.start_at = options['start_at']
runner_cb = callbacks.PlaybookRunnerCallbacks(stats, verbose=utils.VERBOSITY)
pb = ansible.playbook.PlayBook(
    playbook=playbook,
    module_path=None,
    inventory=inventory,
    forks=options['forks'],
    remote_user=options['remote_user'],
    remote_pass=sshpass,
    callbacks=playbook_cb,
    runner_callbacks=runner_cb,
    stats=stats,
    timeout=options['timeout'],
    transport=options['connection'],
    sudo=options['sudo'],
    sudo_user=options['sudo_user'],
    extra_vars=extra_vars,
    private_key_file=options['private_key_file'],
    check=options['check'],
    diff=options['diff']
)
playnum = 0
failed_hosts = []
unreachable_hosts = []
try:
    print pb.run()
    hosts = sorted(pb.stats.processed.keys())
    print hosts
    display(callbacks.banner("PLAY RECAP"))
    playbook_cb.on_stats(pb.stats)
    for h in hosts:
        t = pb.stats.summarize(h)
        if t['failures'] > 0:
            failed_hosts.append(h)
        if t['unreachable'] > 0:
            unreachable_hosts.append(h)
    retries = failed_hosts + unreachable_hosts
    if len(retries) > 0:
        filename = pb.generate_retry_inventory(retries)
        if filename:
            display("  to retry, use: --limit @%s\n" % filename)
    for h in hosts:
        t = pb.stats.summarize(h)
        display("%s : %s %s %s %s" % (
            hostcolor(h, t),
            colorize('ok', t['ok'], 'green'),
            colorize('changed', t['changed'], 'yellow'),
            colorize('unreachable', t['unreachable'], 'red'),
            colorize('failed', t['failures'], 'red')),
            screen_only=True
        )
        display("%s : %s %s %s %s" % (
            hostcolor(h, t, False),
            colorize('ok', t['ok'], None),
            colorize('changed', t['changed'], None),
            colorize('unreachable', t['unreachable'], None),
            colorize('failed', t['failures'], None)),
            log_only=True
        )
except Exception as e:
    print("!!!!!!!ERROR: %s" % e)

Specify the vars in a host_vars file?
E.g. create a YAML file named /etc/ansible/host_vars/localhost with the vars you want to put there.
I don't know, yet, how to specify them in the Python code itself.
--- update ---
After a quick look at the code, I don't think Ansible supports specifying host variables when you pass hosts via the host_list parameter. Hacks I can think of (if you must do this in Python code) are:
[mis]use the extra_vars parameter (see the sketch below).
Write an inventory file (YAML, or an executable that just prints the required JSON) from your Python code and pass its path as the inventory parameter.
HTH
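A minimal sketch of both hacks in Python, assuming the Ansible 1.x API used in the question and an invented value "foo" for excluded_service (the get_hosts()/set_variable() calls are my reading of that API, not something confirmed by the poster):

import ansible.inventory

hosts = ["127.0.0.1"]
inventory = ansible.inventory.Inventory(hosts)

# Hack 1: attach the variable to each host object; Host exposes set_variable() in 1.x.
for host in inventory.get_hosts():
    host.set_variable("excluded_service", "foo")

# Hack 2: pass it globally instead, reusing the extra_vars dict that the question's
# script already hands to ansible.playbook.PlayBook(extra_vars=...).
extra_vars = {"excluded_service": "foo"}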

Related

Comparing id between config value and API output

I want to check whether the fields 'id' and 'folder_id' of api_response equal certain values (here, 25 and 17 respectively). If they do, print True, else print False. My problem is that my script prints False when it should print True, because the fields id and folder_id are correct.
Here is my code:
SendInBlueAPI = config.get('Config','SendInBlueAPI')
SendInBlueFolderID = config.get('Config','SendInBlueFolderID')
SendInBlueListID = config.get('Config','SendInBlueListID')

## Checking if list exists in SendInBlue
configuration = sib_api_v3_sdk.Configuration()
configuration.api_key['api-key'] = '%s' % (SendInBlueAPI)
api_instance = sib_api_v3_sdk.ListsApi(sib_api_v3_sdk.ApiClient(configuration))
list_id = SendInBlueListID

try:
    api_response = api_instance.get_list(list_id)
    if api_response.id == SendInBlueListID and api_response.folder_id == SendInBlueFolderID:
        print(True)
    else:
        print(False)
except ApiException as e:
    print("Exception when calling ListsApi->get_list: %s\n" % e)
This is the output of api_response:
{'campaign_stats': None,
'created_at': '2023-01-16T10:07:47.000+01:00',
'dynamic_list': False,
'folder_id': 17,
'id': 25,
'name': 'My list',
'total_blacklisted': 2,
'total_subscribers': 244,
'unique_subscribers': 246}
And here is my config file:
[Config]
SendInBlueAPI = MY_API_KEY
SendInBlueFolderID = 17
SendInBlueListID = 25
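A likely cause (my reading, not stated in the thread): configparser's get() always returns strings, while api_response.id and api_response.folder_id are integers, so 25 == '25' evaluates to False. A minimal sketch of the comparison with explicit casts:

SendInBlueFolderID = int(config.get('Config', 'SendInBlueFolderID'))
SendInBlueListID = int(config.get('Config', 'SendInBlueListID'))

if api_response.id == SendInBlueListID and api_response.folder_id == SendInBlueFolderID:
    print(True)
else:
    print(False)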

Is there a way to automate VMware snapshot deletion by setting a schedule through Ansible or the REST API?

I am trying to figure out whether there is a way to automate VMware snapshot deletion through Ansible.
I have found vmware_guest_powerstate.py to be the closest fit and tried to modify it, but it fails with "Failed to create scheduled task present as specifications given are invalid: A specified parameter was not correct: spec.action".
pstate = {
    'present': vim.VirtualMachine.CreateSnapshot,
    'absent': vim.VirtualMachine.RemoveAllSnapshots,
}
dt = ""
try:
    dt = datetime.strptime(scheduled_at, "%d/%m/%Y %H:%M")
except ValueError as e:
    module.fail_json(
        msg="Failed to convert given date and time string to Python datetime object,"
        "please specify string in 'dd/mm/yyyy hh:mm' format: %s"
        % to_native(e)
    )
schedule_task_spec = vim.scheduler.ScheduledTaskSpec()
schedule_task_name = module.params["schedule_task_name"] or "task_%s" % str(
    randint(10000, 99999)
)
schedule_task_desc = module.params["schedule_task_description"]
if schedule_task_desc is None:
    schedule_task_desc = (
        "Schedule task for vm %s for "
        "operation %s at %s"
        % (vm.name, scheduled_at)
    )
schedule_task_spec.name = schedule_task_name
schedule_task_spec.description = schedule_task_desc
schedule_task_spec.scheduler = vim.scheduler.OnceTaskScheduler()
schedule_task_spec.scheduler.runAt = dt
schedule_task_spec.action = vim.action.MethodAction()
schedule_task_spec.action.name = pstate[module.params['state']]
schedule_task_spec.enabled = module.params["schedule_task_enabled"]
I suggest trying vim.VirtualMachine.CreateSnapshot_Task and vim.VirtualMachine.RemoveAllSnapshots_Task instead of vim.VirtualMachine.CreateSnapshot and vim.VirtualMachine.RemoveAllSnapshots to see if that works:
pstate = {'present': vim.VirtualMachine.CreateSnapshot_Task,'absent': vim.VirtualMachine.RemoveAllSnapshots_Task}
....
schedule_task_spec.name = schedule_task_name
schedule_task_spec.description = schedule_task_desc
schedule_task_spec.scheduler = vim.scheduler.OnceTaskScheduler()
schedule_task_spec.scheduler.runAt = dt
schedule_task_spec.action = vim.action.MethodAction()
schedule_task_spec.action.name = pstate[module.params['state']]
schedule_task_spec.enabled = module.params["schedule_task_enabled"]
print(schedule_task_spec.action)

NoRegionError Boto3

Is there a way to use the boto3 module without explicitly defining a region? I am trying to run a script that confirms tags have been added to AWS CloudFormation stacks. I want it to run against whatever region is currently selected in my default connection, and I'm not sure whether I need to write something for this. I am also curious whether the region has to be spelled out; if so, is it possible to check ALL regions without slowing things down?
import boto3

cf = boto3.client('cloudformation')

def validate_stack(stack_id):
    rsp = cf.describe_stacks(StackName=stack_id)
    for stack in rsp['Stacks']:
        has_product = False
        has_service = False
        has_team = False
        has_owner = False
        for tag in stack['Tags']:
            if tag['Key'] == 'Product':
                has_product = True
            if tag['Key'] == 'Service':
                has_service = True
            if tag['Key'] == 'Team':
                has_team = True
            if tag['Key'] == 'Owner':
                has_owner = True
        last_time = stack.get('LastUpdatedTime', stack['CreationTime'])
        if (has_product == False) or (has_service == False) or (has_team == False) or (has_owner == False):
            print('last updated: {5}, has_product={1}, has_service={2}, has_team={3}, has_owner={4} {0}'.format(stack_id, has_product, has_service, has_team, has_owner, last_time))

def check_cloudformation(deployment_id):
    list_rsp = cf.list_stacks(
        StackStatusFilter=[
            'CREATE_COMPLETE',
            'UPDATE_COMPLETE'
        ]
    )
    deployment_substring = '-{0}-'.format(deployment_id)
    while True:
        for summary in list_rsp['StackSummaries']:
            if deployment_substring in summary['StackName']:
                validate_stack(summary['StackId'])
        next_token = list_rsp.get('NextToken')
        if next_token == None:
            break
        list_rsp = cf.list_stacks(
            StackStatusFilter=[
                'CREATE_COMPLETE',
                'UPDATE_COMPLETE'
            ],
            NextToken=next_token
        )
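For what it's worth, a minimal sketch of both approaches with boto3 (this reflects standard boto3 region resolution, not anything stated in the question): rely on the default session's region, or loop over every region in which CloudFormation is available.

import boto3

# Option 1: no region in code; boto3 resolves it from AWS_DEFAULT_REGION or ~/.aws/config.
cf = boto3.client('cloudformation')

# Option 2: check every region explicitly.
session = boto3.session.Session()
for region in session.get_available_regions('cloudformation'):
    regional_cf = session.client('cloudformation', region_name=region)
    # call regional_cf.list_stacks(...) / describe_stacks(...) here, as in check_cloudformation()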

How do I achieve a throughput of 50k/sec when inserting my data into Cassandra while reading input from a CSV file?

My aim is to increase the throughput of versioning data in Cassandra. I have used concurrent reads and writes and have also increased the chunk size that my code reads from the file. My machine has 16 GB of RAM and 8 cores, and yes, I have changed Cassandra's yaml file to allow 10k concurrent reads and writes; when I timed it, I found that 10,000 writes/reads take less than a second.
My minimal viable code is:
import json
import logging
import os
import sys
from datetime import datetime
from hashlib import sha256, sha512, sha1
import pandas as pd
from cassandra import ConsistencyLevel, WriteTimeout
from cassandra.cluster import (EXEC_PROFILE_DEFAULT, BatchStatement, Cluster,
ExecutionProfile)
from cassandra.concurrent import (execute_concurrent,
execute_concurrent_with_args)
from cassandra.query import SimpleStatement, dict_factory
class PythonCassandraExample:

    def __init__(self, file_to_be_versioned, working_dir=os.getcwd(), mode='append'):
        self.cluster = None
        self.session = None
        self.keyspace = None
        self.log = None
        self.mode = mode
        self.file_to_be_versioned = file_to_be_versioned
        self.insert_patch = []
        self.delete_patch = []
        self.update_patch = []
        self.working_dir = working_dir

    def __del__(self):
        self.cluster.shutdown()

    def createsession(self):
        profile = ExecutionProfile(
            row_factory=dict_factory,
            request_timeout=6000
        )
        self.cluster = Cluster(
            ['localhost'],
            connect_timeout=50,
            execution_profiles={
                EXEC_PROFILE_DEFAULT: profile
            }
        )
        self.session = self.cluster.connect(self.keyspace)

    def getsession(self):
        return self.session

    # How about Adding some log info to see what went wrong
    def setlogger(self):
        log = logging.getLogger()
        log.setLevel('INFO')
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            "%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
        log.addHandler(handler)
        self.log = log

    # Create Keyspace based on Given Name
    def handle_error(self, exception):
        self.log.error("Failed to fetch user info: %s", exception)

    def createkeyspace(self, keyspace):
        """
        :param keyspace: The Name of Keyspace to be created
        :return:
        """
        # Before we create new lets check if exiting keyspace; we will drop that and create new
        self.log.info("creating keyspace...")
        self.session.execute("""
            CREATE KEYSPACE IF NOT EXISTS %s
            WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '1' }
            """ % keyspace)
        self.log.info("setting keyspace...")
        self.keyspace = keyspace
        self.session.set_keyspace(self.keyspace)
    def create_table_and_set_version(self, table_name):
        self.table_name = table_name.lower()
        table_select_query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name='{}' AND table_name='{}'".format(
            self.keyspace, self.table_name)
        print(table_select_query)
        table_exists = self.session.execute(table_select_query).one()
        self.log.info("Table exists: {}".format(table_exists))
        if table_exists:
            self.log.info(
                "Datapackage already exists! Checking the last version")
            self.version = self.session.execute(
                "SELECT version FROM {} LIMIT 1".format(self.table_name)).one()
            self.log.info(
                "The version fetched is: {} of type: {}".format(
                    self.version, type(self.version)
                )
            )
            if not self.version:
                self.version = 0
            else:
                self.version = self.version['version']
        else:
            self.log.info("Table didn't exist!")
            self.version = 0
        self.target_version = int(str(self.version)) + 1
        self.log.info(
            "Current and candidate versions are: {}, {}".format(
                self.version,
                self.target_version
            )
        )
        # c_sql = "CREATE TABLE IF NOT EXISTS {} (id varchar, version int, row varchar, row_hash varchar, PRIMARY KEY(id, version)) with clustering order by (version desc)".format(
        #     self.table_name)
        c_sql = "CREATE TABLE IF NOT EXISTS {} (id varchar, version int, row_hash varchar, PRIMARY KEY(version, id))".format(
            self.table_name
        )
        self.session.execute(c_sql)
        self.log.info("DP Table Created !!!")
        self.log.info("Current and candidate versions are: {}, {}".format(
            self.version, self.target_version))
    def push_to_update_patch(self, update_patch_file, last_patch=False):
        if len(self.update_patch) >= 10000:
            with open(update_patch_file, mode='a') as json_file:
                json_file.writelines(
                    self.update_patch
                )
            del self.update_patch[:]
        if last_patch is True and len(self.update_patch) > 0:
            with open(update_patch_file, mode='a') as json_file:
                json_file.writelines(
                    self.update_patch
                )
            del self.update_patch[:]

    def push_to_insert_patch(self, insert_patch_file, last_patch=False):
        if len(self.insert_patch) >= 10000:
            with open(insert_patch_file, mode='a') as json_file:
                json_file.writelines(
                    self.insert_patch
                )
            del self.insert_patch[:]
        if last_patch is True and len(self.update_patch) > 0:
            with open(insert_patch_file, mode='a') as json_file:
                json_file.writelines(
                    self.insert_patch
                )
            del self.insert_patch[:]

    def push_to_delete_patch(self, delete_patch_file, last_patch=False):
        if len(self.delete_patch) >= 10000:
            with open(delete_patch_file, mode='a') as json_file:
                json_file.writelines(
                    self.delete_patch
                )
            del self.delete_patch[:]
        if last_patch is True and len(self.delete_patch) > 0:
            with open(delete_patch_file, mode='a') as json_file:
                json_file.writelines(
                    self.delete_patch
                )
            del self.delete_patch[:]
    def push_to_patch(self, key, value, mode='update'):
        return
        if key is None or value is None:
            raise ValueError(
                "Key or value or both not specified for making a patch. Exiting now."
            )
        data = {}
        data["id"] = str(key)
        data["data"] = json.dumps(value, default=str)
        # convert dict to json str so that the patch is a list of line jsons.
        data = json.dumps(data, default=str)
        json_patch_file = os.path.join(
            self.working_dir,
            "version_{}_{}.json".format(
                self.target_version, mode
            )
        )
        if mode == 'update':
            self.update_patch.append(
                data + "\n"
            )
            self.push_to_update_patch(
                json_patch_file
            )
        if mode == 'insert':
            self.insert_patch.append(
                data + "\n"
            )
            self.push_to_insert_patch(
                json_patch_file
            )
        if mode == 'delete':
            self.delete_patch.append(
                data + "\n"
            )
            self.push_to_delete_patch(
                json_patch_file
            )
    def clone_version(self):
        if self.mode == 'replace':
            return
        self.log.info("Cloning version")
        start_time = datetime.utcnow()
        if self.version == 0:
            return
        insert_sql = self.session.prepare(
            (
                "INSERT INTO {} ({}, {}, {}) VALUES (?,?,?)"
            ).format(
                self.table_name, "id", "version", "row_hash"
            )
        )
        futures = []
        current_version_query = "SELECT id, row_hash FROM {} WHERE version={}".format(
            self.table_name, self.version
        )
        current_version_rows = self.session.execute(
            current_version_query
        )
        for current_version_row in current_version_rows:
            params = (
                current_version_row['id'],
                self.target_version,
                current_version_row['row_hash']
            )
            futures.append(
                (
                    insert_sql,
                    params
                )
            )
        self.log.info(
            "Time taken to clone the version is: {}".format(
                datetime.utcnow() - start_time
            )
        )

    def hash_string(self, value):
        return (sha1(str(value).encode('utf-8')).hexdigest())

    def hash_row(self, row):
        row_json = json.dumps(row, default=str)
        return (self.hash_string(row_json))
    def insert_data(self, generate_diff=False):
        self.generate_diff = generate_diff
        destination = self.file_to_be_versioned
        chunksize = 100000
        concurrency_value = 1000
        patch_length_for_cql = chunksize
        chunks = pd.read_csv(destination, chunksize=chunksize)
        chunk_counter = 0
        insert_sql = self.session.prepare(
            (
                "INSERT INTO {} ({}, {}, {}) VALUES (?,?,?)"
            ).format(
                self.table_name, "id", "version", "row_hash"
            )
        )
        select_sql = self.session.prepare(
            (
                "SELECT id, version, row_hash FROM {} WHERE version=? AND id=?"
            ).format(
                self.table_name
            )
        )
        futures = []
        check_for_patch = []  # this list comprises rows with ids and values for checking whether its an update/insert
        rows_for_checking_patch = []
        start_time = datetime.utcnow()
        for df in chunks:
            rows_for_checking_patch = df.values.tolist()
            chunk_counter += 1
            df["row_hash"] = df.apply(
                self.hash_row
            )
            df["key"] = df["column_test_3"].apply(
                self.hash_string
            )
            keys = list(df["key"])
            row_hashes = list(df["row_hash"])
            start_time_de_params = datetime.utcnow()
            for i in range(chunksize):
                row_check = None
                params = (
                    str(keys[i]),
                    self.target_version,
                    str(row_hashes[i])
                )
                check_for_patch_params = (
                    self.version,
                    str(keys[i])
                )
                check_for_patch.append(
                    (
                        select_sql,
                        check_for_patch_params
                    )
                )
                futures.append(
                    (
                        insert_sql,
                        params
                    )
                )
            self.log.info("Time for params: {}".format(datetime.utcnow() - start_time_de_params))
            if len(check_for_patch) >= patch_length_for_cql:
                start_time_de_update = datetime.utcnow()
                results = execute_concurrent(
                    self.session, check_for_patch, concurrency=concurrency_value, raise_on_first_error=False
                )
                self.log.info("Time for just the query: {}".format(datetime.utcnow() - start_time_de_update))
                row_counter_for_patch = 0
                for (success, result) in results:
                    if not result:
                        self.push_to_patch(
                            keys[row_counter_for_patch],
                            rows_for_checking_patch[row_counter_for_patch],
                            mode='insert'
                        )
                        row_counter_for_patch += 1
                        continue
                    if not success:
                        # result will be an Exception
                        self.log.error("Error has occurred in insert cql")
                        self.handle_error(result)
                    id_to_be_compared = result[0]["id"]
                    row_hash_to_be_compared = result[0]["row_hash"]
                    if (row_hash_to_be_compared != row_hashes[row_counter_for_patch]):
                        self.push_to_patch(
                            id_to_be_compared,
                            rows_for_checking_patch[row_counter_for_patch]["row"],
                            mode='update'
                        )
                    row_counter_for_patch += 1
                del check_for_patch[:]
                del rows_for_checking_patch[:]
                row_counter_for_patch = 0
                self.log.info("Time for check patch: {}".format(
                    datetime.utcnow() - start_time_de_update
                ))
            if (len(futures) >= patch_length_for_cql):
                start_time_de_insert = datetime.utcnow()
                results = execute_concurrent(
                    self.session, futures, concurrency=concurrency_value, raise_on_first_error=False
                )
                for (success, result) in results:
                    if not success:
                        # result will be an Exception
                        self.log.error("Error has occurred in insert cql")
                        self.handle_error(result)
                del futures[:]
                self.log.info("Time for insert patch: {}".format(
                    datetime.utcnow() - start_time_de_insert
                ))
            self.log.info(chunk_counter)
            # self.log.info("This chunk got over in {}".format(datetime.utcnow() - start_time))
        if len(check_for_patch) > 0:
            results = execute_concurrent(
                self.session, check_for_patch, concurrency=concurrency_value, raise_on_first_error=False
            )
            row_counter_for_patch = 0
            for (success, result) in results:
                if not result:
                    self.push_to_patch(
                        rows_for_checking_patch[row_counter_for_patch]["id"],
                        rows_for_checking_patch[row_counter_for_patch]["row"],
                        mode='insert'
                    )
                    row_counter_for_patch += 1
                    continue
                if not success:
                    # result will be an Exception
                    self.log.error("Error has occurred in insert cql")
                    self.handle_error(result)
                id_to_be_compared = result[0]["id"]
                row_hash_to_be_compared = result[0]["row_hash"]
                if (row_hash_to_be_compared != rows_for_checking_patch[row_counter_for_patch]["row_hash"]):
                    self.push_to_patch(
                        id_to_be_compared,
                        rows_for_checking_patch[row_counter_for_patch]["row"],
                        mode='update'
                    )
                row_counter_for_patch += 1
            del check_for_patch[:]
            del rows_for_checking_patch[:]
        if len(futures) > 0:  # in case the last dataframe has #rows < 10k.
            results = execute_concurrent(
                self.session, futures, concurrency=concurrency_value, raise_on_first_error=False
            )
            for (success, result) in results:
                if not success:
                    self.handle_error(result)
            del futures[:]
        self.log.info(chunk_counter)
        # Check the delete patch
        if self.generate_diff is True and self.mode is 'replace' and self.version is not 0:
            self.log.info("We got to find the delete patch!")
            start_time = datetime.utcnow()
            current_version_query = "SELECT id, row, row_hash FROM {} WHERE version={}".format(
                self.table_name, self.version
            )
            current_version_rows = self.session.execute(
                current_version_query
            )
            for current_version_row in current_version_rows:
                row_check_query = "SELECT {} FROM {} WHERE {}={} AND {}='{}' ".format(
                    "id", self.table_name, "version", self.target_version, "id", current_version_row.id
                )
                row_check = self.session.execute(row_check_query).one()
                if row_check is not None:  # row exists in both versions.
                    continue
                self.push_to_patch(
                    current_version_row.id,
                    current_version_row.id,
                    mode="delete"
                )
        print("Complete insert's duration is: {}".format(
            datetime.utcnow() - start_time)
        )
        # Calling last_patch for all remaining diffs
        modes = [
            'update',
            'insert',
            'delete'
        ]
        for mode in modes:
            json_patch_file = os.path.join(
                self.working_dir,
                "version_{}_{}.json".format(
                    self.target_version, mode
                )
            )
            if mode == 'update':
                self.push_to_update_patch(
                    json_patch_file,
                    last_patch=True
                )
            if mode == 'insert':
                self.push_to_insert_patch(
                    json_patch_file,
                    last_patch=True
                )
            if mode == 'delete':
                self.push_to_delete_patch(
                    json_patch_file,
                    last_patch=True
                )
if __name__ == '__main__':
    example1 = PythonCassandraExample(
        file_to_be_versioned="hundred_million_eleven_columns.csv"
    )
    example1.createsession()
    example1.setlogger()
    example1.createkeyspace('sat_athena_one')
    example1.create_table_and_set_version('five_hundred_rows')
    example1.clone_version()
    example1.insert_data(generate_diff=True)
I have a CSV file of 100M rows and 11 columns. The script used to generate such a file is:
import csv
import sys
import os
import pandas as pd

file_name = "hundred_million_eleven_columns.csv"
rows_list = []
chunk_counter = 1
headers = [
    "column_test_1",
    "column_test_2",
    "column_test_3",
    "column_test_4",
    "column_test_5",
    "column_test_6",
    "column_test_7",
    "column_test_8",
    "column_test_9",
    "column_test_10",
    "column_test_11",
]
file_exists = os.path.isfile(file_name)
with open(file_name, 'a') as csvfile:
    writer = csv.DictWriter(csvfile, delimiter=',',
                            lineterminator='\n', fieldnames=headers)
    if not file_exists:
        writer.writeheader()  # file doesn't exist yet, write a header
    for i in range(100000000):
        dict1 = [
            i, i+1, i+2, i+3, i+4, i+5, i+6, i+7, i+8, i+9, i+10
        ]
        # get input row in dictionary format
        # key = col_name
        rows_list.append(dict1)
        if len(rows_list) == 100000:
            df = pd.DataFrame(rows_list)
            df.to_csv(file_name,
                      mode='a', index=False, header=False)
            del rows_list[:]
            del df
            print(chunk_counter)
            chunk_counter += 1
if len(rows_list) > 0:
    df = pd.DataFrame(rows_list)
    df.to_csv(file_name, mode='a', index=False, header=False)
    del rows_list[:]
    del df
    print(chunk_counter)
    chunk_counter += 1
My cassandra's yaml file is here
Make sure that your code can even generate that much data at 50k/sec. If you remove the executes, can you even read the CSV and generate the SHA that fast? A C* instance on a host that size with SSDs should be able to do 50k writes/sec, but there is a lot going on outside of the C* writes that is likely part of the issue.
If your concurrent reads/writes are above 128 you are going to have some serious issues. On a system that can handle it, even 64 is enough to go past 200k writes a sec. You are actually going to make things much worse with a setting that high. There is no I/O involved in that, so as the documentation states, 8x your core count is a good value. I would even recommend lowering the concurrency you're pushing from 10k to something like 1024 or even lower. You can play with different settings to see how they impact things.
Make sure the Python driver was built with its Cython extensions on your install, as the run is going to be dominated by serialization otherwise. Speaking of which, the Python driver is the slowest, so keep that in mind.
Blocking on the SHA can take a majority of the time. Without perf tracing, just try it with a fixed value to see the difference.
"My machine" -- is this a single-node cluster? If you're throwing availability out the window, you might as well disable durable_writes on the keyspace to speed up the writes a bit. Heap settings are missing, but make sure you have a minimum of 8 GB; even if this is a pretty small host, Cassandra needs memory. If you're not reading, consider disabling the key cache and maybe disabling compaction while the job is running, then re-enable it afterwards.
The comment in cassandra.yaml recommends 8 * number of cores:
On the other hand, since writes are almost never IO bound, the ideal
number of "concurrent_writes" is dependent on the number of cores in
your system; (8 * number_of_cores) is a good rule of thumb.
So 64 is appropriate on an 8-core machine:
concurrent_reads: 64
concurrent_writes: 64
concurrent_counter_writes: 64
These limits are recommended because there are many other I/O operations besides the normal reads and writes, e.g. writing the commit log, caching, compaction, replication, and views (if they exist).
Some rules of thumb:
disk_optimization_strategy: ssd (if your disk is an HDD, change the value to spinning)
Use a dedicated commit log disk; SSD recommended.
More disks = better performance.
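As a concrete example of the durable_writes suggestion above, a minimal sketch using the question's own session and keyspace (this trades durability for write speed, so it only makes sense on a throwaway single-node setup):

session.execute(
    "ALTER KEYSPACE sat_athena_one "
    "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'} "
    "AND durable_writes = false"
)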

Does Python 3 have an LDAP module?

I am porting some Java code to Python and we would like to use Python 3, but I can't find an LDAP module for Python 3 on Windows.
This is forcing us to use version 2.6, which is bothersome as the rest of the code is already in 3.0 format.
You may use the ldap3 module (formerly known as python3-ldap); it runs on Python 3 really well and requires no external C dependencies. It can also correctly handle both unicode and byte data in LDAP records (early versions had trouble with the jpegPhoto field, but now everything is fine).
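For reference, a minimal ldap3 search sketch; the host, bind DN, and password below are placeholders, not values from the question:

from ldap3 import ALL, Connection, Server

server = Server("ldap.example.com", get_info=ALL)  # placeholder host
conn = Connection(server,
                  user="cn=reader,dc=example,dc=com",  # placeholder bind DN
                  password="secret",
                  auto_bind=True)
conn.search("dc=example,dc=com", "(objectClass=person)", attributes=["cn", "mail"])
for entry in conn.entries:
    print(entry)
conn.unbind()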
If you're running this on Windows, you can get LDAP to work in Python 3.1 by using the ADO access method via Mark Hammond's PyWin32.
To test this, I installed ActiveState Python 3.1, then installed PyWin32 for Python 3.1
http://sourceforge.net/projects/pywin32/files/
I was then able to run LDAP queries using a module I wrote that is based on this LDAP code from the ActiveState Python Cookbook:
Recipe 511451: Dump all Active Directory Information using LDAP scripting by Manuel Garcia
http://code.activestate.com/recipes/511451/
although now that I look at it I realize I completely rewrote my module just using his code as an example.
Update
Here is my LDAPList module and another support module to convert user access bit codes into something a tiny bit more english-like:
LDAPList.py
# LDAPList.py
# Todd Fiske
# class to encapsulate accessing LDAP information
# 2009-03-18 first version
# 2010-01-04 updated for Python 3 (print functions, <> to !=)

import win32com.client
import UACCodes

ADS_SCOPE_SUBTREE = 2

class LDAPList():
    def __init__(self, sContext):
        self.Context = sContext  # naming context, "DC=xyz,DC=org"
        self.objectCategory = ""
        self.objectClass = ""
        self.FilterClause = ""
        self.query = ""
        self.cn = None
        self.cm = None
        self.rs = None

    def SetCategory(self, sCategory):
        self.objectCategory = sCategory
        self.FilterClause = "where objectCategory = '%s'" % self.objectCategory

    def SetClass(self, sClass):
        self.objectClass = sClass
        self.FilterClause = "where objectClass = '%s'" % self.objectClass

    def open(self):
        self.query = "select * from 'LDAP://%s' %s order by displayName" % (self.Context, self.FilterClause)
        self.cn = win32com.client.Dispatch("ADODB.Connection")
        self.cm = win32com.client.Dispatch("ADODB.Command")
        self.cn.Open("Provider=ADsDSOObject")
        self.cm.ActiveConnection = self.cn
        self.cm.Properties["Page Size"] = 1000
        self.cm.Properties["Searchscope"] = ADS_SCOPE_SUBTREE
        self.cm.CommandText = self.query
        self.rs = self.cm.Execute()[0]

    def close(self):
        if self.rs is not None:
            self.rs.Close()
            self.rs = None
        if self.cm is not None:
            self.cm = None
        if self.cn is not None:
            self.cn.Close()
            self.cn = None

    def count(self):
        if self.rs is None:
            return -2
        return self.rs.RecordCount

    def more(self):
        if self.rs is None:
            return False
        return not self.rs.EOF

    def GetObject(self):
        if self.rs is None:
            return None
        return win32com.client.GetObject(self.rs.Fields["ADsPath"].Value)

    def next(self):
        if self.rs is None:
            return
        self.rs.MoveNext()

#----------
# helper functions

def NamingContext():
    # return default naming context
    root = win32com.client.GetObject("LDAP://RootDse")
    return root.get("DefaultNamingContext")

def AccountControl(obj):
    if obj.userAccountControl is not None:
        return obj.userAccountControl
    else:
        return 0

def ConvertUAC(nUAC):
    return UACCodes.ConvertUAC(nUAC)

def AccountActive(n):
    return (n & UACCodes.ADS_UF_ACCOUNTDISABLE) != UACCodes.ADS_UF_ACCOUNTDISABLE

def GetCategory(obj):
    # CN=Group,CN=Schema,CN=Configuration,DC=xyz,DC=org
    s = obj.objectCategory
    s = s.split(",")[0][3:]
    return s
    # s = "Group"

def GetGroups(obj):
    """
    ('CN=XYZ Staff Rockville,OU=Distribution Groups,DC=xyz,DC=org',
     'CN=XYZ Staff,OU=Distribution Groups,DC=xyz,DC=org')
    """
    if obj.memberOf is None:
        return ""
    if type(obj.memberOf) == type(()):
        tGroups = obj.memberOf
    else:
        tGroups = (obj.memberOf,)
    return tGroups

def GetNameParts(obj):
    if obj.givenName is None:
        sFirst = ""
    else:
        sFirst = obj.givenName
    if obj.middleName is None:
        sMiddle = ""
    else:
        sMiddle = obj.middleName
    if obj.sn is None:
        sLast = ""
    else:
        sLast = obj.sn
    if sLast == "" and sFirst == "":
        if obj.name is not None:
            sName = obj.name
            sName = sName[3:]
            lParts = sName.split(" ")
            if len(lParts) == 1:
                "todo: split on embedded capital letter"
                print("single-part name: %s" % sName)
                sFirst = sName
            else:
                sLast = lParts[-1]
                sFirst = " ".join(lParts[:-1])
    return (sFirst, sMiddle, sLast)

def GetManager(obj):
    if obj.manager is None:
        return ""
    else:
        return obj.manager

#----------
# test

if __name__ == "__main__":
    print
    print("testing LDAPList class")
    nc = NamingContext()
    print("context =", nc)
    ll = LDAPList(nc)
    ll.SetCategory('user')
    ll.open()  # generates recordset
    print("query = %s" % ll.query)
    print("%d items" % ll.count())
    n = 0
    while (n < 10) and (ll.more()):
        o = ll.GetObject()  # return
        nUAC = AccountControl(o)
        print("%-30s %-30s %-30s %-40s %s" % (
            o.displayName,
            o.name,
            o.sAMAccountName,
            UACCodes.ConvertUAC(nUAC),
            GetManager(o)
        ))
        n += 1
        ll.next()
    ll.close()
###
UACCodes.py
# UACCodes.py
# Todd Fiske
# generated 2009-09-23 16:36:56 by BuildUACCodes.py
# updated 2010-01-04 for Python 3 (print functions)
# provide UAC constants, lookup list, and conversion function
import sys
# UAC Constants
ADS_UF_SCRIPT = 0x00000001
ADS_UF_ACCOUNTDISABLE = 0x00000002
ADS_UF_HOMEDIR_REQUIRED = 0x00000008
ADS_UF_LOCKOUT = 0x00000010
ADS_UF_PASSWD_NOTREQD = 0x00000020
ADS_UF_PASSWD_CANT_CHANGE = 0x00000040
ADS_UF_ENCRYPTED_TEXT_PASSWORD_ALLOWED = 0x00000080
ADS_UF_TEMP_DUPLICATE_ACCOUNT = 0x00000100
ADS_UF_NORMAL_ACCOUNT = 0x00000200
ADS_UF_INTERDOMAIN_TRUST_ACCOUNT = 0x00000800
ADS_UF_WORKSTATION_TRUST_ACCOUNT = 0x00001000
ADS_UF_SERVER_TRUST_ACCOUNT = 0x00002000
ADS_UF_DONT_EXPIRE_PASSWD = 0x00010000
ADS_UF_MNS_LOGON_ACCOUNT = 0x00020000
ADS_UF_SMARTCARD_REQUIRED = 0x00040000
ADS_UF_TRUSTED_FOR_DELEGATION = 0x00080000
ADS_UF_NOT_DELEGATED = 0x00100000
ADS_UF_USE_DES_KEY_ONLY = 0x00200000
ADS_UF_DONT_REQUIRE_PREAUTH = 0x00400000
ADS_UF_PASSWORD_EXPIRED = 0x00800000
ADS_UF_TRUSTED_TO_AUTHENTICATE_FOR_DELEGATION = 0x01000000
# UAC short name lookup list
lUACCodes = [
    ("ADS_UF_SCRIPT" , 0x00000001, "script"),
    ("ADS_UF_ACCOUNTDISABLE" , 0x00000002, "disabled"),
    ("ADS_UF_HOMEDIR_REQUIRED" , 0x00000008, "homedir"),
    ("ADS_UF_LOCKOUT" , 0x00000010, "lockout"),
    ("ADS_UF_PASSWD_NOTREQD" , 0x00000020, "pwnotreqd"),
    ("ADS_UF_PASSWD_CANT_CHANGE" , 0x00000040, "pwcantchange"),
    ("ADS_UF_ENCRYPTED_TEXT_PASSWORD_ALLOWED" , 0x00000080, "encryptedpw"),
    ("ADS_UF_TEMP_DUPLICATE_ACCOUNT" , 0x00000100, "dupaccount"),
    ("ADS_UF_NORMAL_ACCOUNT" , 0x00000200, "useracct"),
    ("ADS_UF_INTERDOMAIN_TRUST_ACCOUNT" , 0x00000800, "interdomain"),
    ("ADS_UF_WORKSTATION_TRUST_ACCOUNT" , 0x00001000, "workstation"),
    ("ADS_UF_SERVER_TRUST_ACCOUNT" , 0x00002000, "server"),
    ("ADS_UF_DONT_EXPIRE_PASSWD" , 0x00010000, "pwnoexpire"),
    ("ADS_UF_MNS_LOGON_ACCOUNT" , 0x00020000, "mnslogon"),
    ("ADS_UF_SMARTCARD_REQUIRED" , 0x00040000, "smartcard"),
    ("ADS_UF_TRUSTED_FOR_DELEGATION" , 0x00080000, "trustdeleg"),
    ("ADS_UF_NOT_DELEGATED" , 0x00100000, "notdeleg"),
    ("ADS_UF_USE_DES_KEY_ONLY" , 0x00200000, "deskey"),
    ("ADS_UF_DONT_REQUIRE_PREAUTH" , 0x00400000, "nopreauth"),
    ("ADS_UF_PASSWORD_EXPIRED" , 0x00800000, "pwexpired"),
    ("ADS_UF_TRUSTED_TO_AUTHENTICATE_FOR_DELEGATION", 0x01000000, "trustauth"),
]

# UAC conversion function
def ConvertUAC(nUAC):
    s = ""
    for c in lUACCodes:
        if ((nUAC & c[1]) == c[1]):
            s = s + c[2] + " "
    return s

# test routine
if __name__ == "__main__":
    print("UACCodes Test")
    print("-------------")
    for n in [0, 512, 514, 65535]:
        print("%d = %s" % (n, ConvertUAC(n)))
    print
    for s in sys.argv[1:]:
        n = int(s)
        print("%d = %s" % (n, ConvertUAC(n)))
###
Both modules have some usage examples and should be fairly easy to figure out, but let me know if you have any questions or comments.
There is a pure-Python implementation of an LDAP client called Ldaptor. I don't think it's maintained, though. If you really need it, you might be able to run 2to3 on it and port it.
This answer is no longer accurate; see the other answers.
Sorry to break this to you, but I don't think there is a python-ldap for Python 3 (yet)...
That's the reason why we keep active development at Python 2.6 for now (as long as the most crucial dependencies (libs) are not ported to 3.0).
