Python utility for parsing blocks?

I have a file that starts something like:
databaseCons = {
    main = {
        database = "readable_name",
        hostname = "hostdb1.serv.com",
        instances = {
            slaves = {
                conns = "8"
            }
        }
        maxconns = "5",
        user = "user",
        pass = "pass"
    }
}
So, what I'd like to do is parse this out into a dict of sub-dicts, something like:
{'main': {'database': 'readable_name', 'hostname': 'hostdb1.serv.com', 'maxconns': '5', 'instances': {'slave': {'maxCount': '8'}}, 'user': 'user', 'pass': 'pass'}}
I think the above makes sense... but please feel free to edit this if it doesn't. Basically I want the equivalent of:
conns = '8'
slave = dict()
slave['maxCount'] = conns
instances = dict()
instances['slave'] = slave
database = 'readable_name'
hostname = 'hostdb1.serv.com'
maxconns = '5'
user = 'user'
pas = 'pass'
main = dict()
main['database'] = database
main['hostname'] = hostname
main['instances'] = instances
main['maxconns'] = maxconns
main['user'] = user
main['pass'] = pas
databaseCons = dict()
databaseCons['main'] = main
Are there any modules out there that can handle this sort of parsing? Even what I've suggested above looks messy... there's got to be a better way, I'd imagine.

Here is a pyparsing parser for your config file:
from pyparsing import *

def to_dict(t):
    # each Group parses as a (key, value) pair
    return {k: v for k, v in t}

series = Forward()
struct = Suppress('{') + series + Suppress('}')
value = quotedString.setParseAction(removeQuotes) | struct
token = Word(alphanums)
assignment = Group(token + Suppress('=') + value + Suppress(Optional(",")))
series << ZeroOrMore(assignment).setParseAction(to_dict)
language = series + stringEnd

def config_file_to_dict(filename):
    return language.parseFile(filename)[0]

if __name__ == "__main__":
    from pprint import pprint
    pprint(config_file_to_dict('config.txt'))
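Running this against the sample file above should produce a nested dict that mirrors the config. Note the keys come straight from the file, so the inner block parses as slaves/conns rather than the slave/maxCount renaming shown in the question:

{'main': {'database': 'readable_name',
          'hostname': 'hostdb1.serv.com',
          'instances': {'slaves': {'conns': '8'}},
          'maxconns': '5',
          'pass': 'pass',
          'user': 'user'}}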

Related

Making dictionary more functional

How can I change the code to make it more functional, i.e. replace the for loop with something shorter that produces the same result?
def get_language_from_text():
    """
    Simply return an array with the predicted languages.
    """
    resultSet = []
    inputdata = request.data
    #print(inputdata)
    inputdataparsed = json.loads(request.data)
    array_of_sentences = inputdataparsed['inputdata']
    for obj_in_array in array_of_sentences:
        obj_in_array_tmp = obj_in_array
        sentence = obj_in_array['TEXT']
        obj_type = obj_in_array['TYPE']
        obj_lang = obj_in_array['LANGUAGE']
        prediction = detect(sentence)
        result_to_safe = {"TEXT": sentence,
                          "TYPE": obj_type,
                          "LANGUAGE": obj_lang,
                          "PREDICTED_LANGUAGE": prediction}
        resultSet.append(result_to_safe)
        break
    print(resultSet)
Your code is fine; it could use a bit of cleaning, but that's okay.
You can shorten your loop to:
def make_dict(sentence_dict):
    return {
        "TEXT": sentence_dict["TEXT"],
        "TYPE": sentence_dict["TYPE"],
        "LANGUAGE": sentence_dict["LANGUAGE"],
        "PREDICTED_LANGUAGE": detect(sentence_dict["TEXT"])
    }

result_set = [make_dict(sentence_dict) for sentence_dict in array_of_sentences]
You can make this more functional by mapping make_dict over array_of_sentences as follows:
result_set = list(map(make_dict, array_of_sentences))
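Note that in Python 3, map returns a lazy iterator rather than a list, which is why the result is wrapped in list() here; both versions produce the same result_set.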

Parse csv containing array in one column in Node.js

We have a Python script that creates a CSV file of enterprise data. One part of the enterprise data is a list of NACE codes (which can be None); it looks like this once it's written to the CSV file: ['47299', '8690901', '4729903', '86909'] (it's one cell).
In a second script, this time written in Node.js, we parse the CSV file with papaparse. We want the NACE codes to be an array, but they come back as the string "['47299', '8690901', '4729903', '86909']".
How can we parse this string into an array? I found a possible solution using JSON.parse, but it gives me an Unexpected token ' in JSON at position 1 error.
Python script
class Enterprise:
    def __init__(self):
        self.enterprise_number = ''
        self.vat_number = ''
        self.nace_codes = set()
        self.tel = ''
        self.mobile = ''
        self.email = ''

    def to_json(self):
        return {
            'enterprise_number': self.enterprise_number if self.enterprise_number != '' else None,
            'vat_number': self.vat_number if self.vat_number != '' else None,
            'nace_codes': list(self.nace_codes) if len(self.nace_codes) > 0 else None,
            'tel': self.tel if self.tel != '' else None,
            'mobile': self.mobile if self.mobile != '' else None,
            'email': self.email if self.email != '' else None,
        }

def read_data():
    ...
    with open('enterprise_data.csv', 'w') as file:
        writer = csv.writer(file, delimiter=';')
        writer.writerow(['enterprise_number', 'vat_number', 'name', 'nace_codes', 'type_of_enterprise', 'juridical_form', 'start_date', 'county', 'city', 'address', 'postal_code', 'box', 'group_part', 'group_number', 'tel', 'mobile', 'email', 'is_active'])
        with open('data/enterprise_insert.csv') as file:
            for line in islice(file, 1, None):
                enterprise = Enterprise()
                line = line.rstrip()
                ...
                formatted_data = enterprise.to_json()
                writer.writerow([formatted_data['enterprise_number'], formatted_data['vat_number'], formatted_data['nace_codes'], formatted_data['tel'], formatted_data['mobile'], formatted_data['email']])
Node.js script
const csvFilePath = 'data/enterprise_data.csv'

const readCSV = async (filePath) => {
    const csvFile = fs.readFileSync(filePath);
    const csvData = csvFile.toString();
    return new Promise(resolve => {
        Papa.parse(csvData, {
            header: true,
            skipEmptyLines: true,
            transformHeader: header => header.trim(),
            complete: results => {
                console.log('Read', results.data.length, 'records.');
                resolve(results.data);
            }
        });
    });
};

const start = async () => {
    try {
        let parsedData = await readCSV(csvFilePath);
        parsedData.map((row, i) => {
            console.log(`${i} | ${row.enterprise_number}`);
            const nace_codes = row.nace_codes ? JSON.parse(row.nace_codes) : '';
            console.log('Parsed value: ', nace_codes);
        });
    } catch(error) {
        console.log(`Crashed | ${error} `);
    }
}

start();
Assuming that csvData does look like ['47299', '8690901', '4729903', '86909']:
What's wrong is that single quotes are not valid string delimiters in JSON, so JSON.parse throws an error.
To fix this, you simply need to replace all occurrences of single quotes with double quotes, like so:
const csvData = csvFile.toString().replaceAll("'", '"')
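If you control the Python side, an alternative is to write the cell as real JSON in the first place, so no quote replacement is needed in Node. A minimal sketch against the writerow call above (only the nace_codes cell changes):

import json

# json.dumps emits double-quoted JSON, e.g. ["47299", "8690901"],
# which JSON.parse on the Node side accepts directly
nace_codes_cell = json.dumps(formatted_data['nace_codes']) if formatted_data['nace_codes'] else ''
writer.writerow([formatted_data['enterprise_number'], formatted_data['vat_number'], nace_codes_cell,
                 formatted_data['tel'], formatted_data['mobile'], formatted_data['email']])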

CSV to elasticsearch with python SerializationError

When I try to send the bulk_data to the local Elasticsearch, my data isn't loaded because of a SerializationError.
I already tried filling the empty cells in the CSV file, but that wasn't the solution.
from elasticsearch import Elasticsearch

bulk_data = []
header = []
count = 0
for row in csv_file_object:
    if count > 0:
        data_dict = {}
        for i in range(len(row)):
            row = row.rstrip()
            data_dict[header[i]] = row[i]
        op_dict = {
            "index": {
                "_index": INDEX_NAME,
                "_type": TYPE_NAME,
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)
    else:
        header = row
    count = count + 1

# create ES client, create index
es = Elasticsearch(hosts=[ES_HOST])
if es.indices.exists(INDEX_NAME):
    print("deleting '%s' index..." % (INDEX_NAME))
    res = es.indices.delete(index=INDEX_NAME)
res = es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
See image for the SerializationError and bulk_data values:
Please note: the \n is added by the serialization process itself.
I'm trying to respond, but there's one thing I can't understand: how do you retrieve your field names from the data? In your code you seem to take them from a list called header that starts out empty. Check my answer; I'm not sure I've understood your setup correctly.
from elasticsearch import Elasticsearch
from elasticsearch import helpers

index_name = "your_index_name"
doc_type = "your_doc_type"
esConnector = Elasticsearch(["http://192.168.1.1:9200/"])  # change your IP here

def generate_data(csv_file_object):
    with open(csv_file_object, "r") as f:
        header = f.readline().rstrip().split(",")  # field names come from the first row
        for count, line in enumerate(f, start=1):
            values = line.rstrip().split(",")
            data_dict = dict(zip(header, values))
            yield {
                '_op_type': 'index',
                '_index': index_name,
                '_type': doc_type,
                '_id': count,
                '_source': data_dict
            }

for success, info in helpers.parallel_bulk(client=esConnector, actions=generate_data(csv_file_object), thread_count=4):
    if not success:
        print('Doc failed', info)
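If the goal is simply to map header fields onto each row's values, the standard library's csv.DictReader can replace the manual header bookkeeping entirely. A minimal sketch of that variant, reusing the index_name and doc_type above:

import csv
from elasticsearch import Elasticsearch, helpers

def generate_actions(csv_path):
    with open(csv_path, newline='') as f:
        # DictReader consumes the header row itself and yields one dict per data row
        for i, row in enumerate(csv.DictReader(f), start=1):
            yield {
                '_op_type': 'index',
                '_index': index_name,
                '_type': doc_type,
                '_id': i,
                '_source': row
            }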

Python API JSON dictionary loop

My code works, but I know there has to be a more efficient way to accomplish it, along with better exception handling.
API data format per item:
{
    u'logEnd': False,
    u'logBegin': True,
    u'links': {
        u'self': u'https://192.168.15.140/api/fmc_config/v1/domain/e276abec-e0f2-11e3-8169-6d9ed49b625f/policy/accesspolicies/00505689-7E52-0ed3-0000-184683643989/accessrules/00505689-7E52-0ed3-0000-000268439552'
    },
    'sourceNetworks': 'any-src',
    u'name': u'Peer-to-Peer Blocking',
    'sourceZones': 'any-src-zn',
    u'enabled': True,
    'sourcePorts': 'any-src-port',
    u'logFiles': False,
    u'vlanTags': {},
    u'applications': {
        u'applications': [
            {
                u'type': u'Application',
                u'id': u'61',
                u'name': u'BitTorrent'
            },
            {
                u'type': u'Application',
                u'id': u'571',
                u'name': u'BitTorrent tracker'
            },
            {
                u'type': u'Application',
                u'id': u'1214',
                u'name': u'ExtraTorrent'
            }
        ]
    },
    u'sendEventsToFMC': True,
    u'action': u'BLOCK',
    'destinationPorts': 'any-dest-port',
    'destinationNetworks': 'any-dest',
    u'variableSet': {
        u'type': u'VariableSet',
        u'name': u'Default Set',
        u'id': u'76fa83ea-c972-11e2-8be8-8e45bb1343c0'
    },
    u'type': u'AccessRule',
    u'id': u'00505689-7E52-0ed3-0000-000268439552',
    'destinationZones': 'any-dst-zn',
    u'metadata': {
        u'category': u'--Undefined--',
        u'accessPolicy': {
            u'type': u'AccessPolicy',
            u'name': u'PVmain1-ACPolicy-201610251131',
            u'id': u'00505689-7E52-0ed3-0000-184683643989'
        },
        u'section': u'Mandatory',
        u'domain': {
            u'type': u'Domain',
            u'name': u'Global',
            u'id': u'e276abec-e0f2-11e3-8169-6d9ed49b625f'
        },
        u'timestamp': 1482339574186
    }
}
Working Python Script
for i in results:
    response = requests.request("GET", i, headers=headers, verify=False)
    raw = response.json()
    raw.setdefault('name', "noname_rule")
    raw.setdefault('action', "no_action")
    raw.setdefault('sourceNetworks', "any-src")
    raw.setdefault('destinationNetworks', "any-dest")
    raw.setdefault('sourcePorts', "any-src-port")
    raw.setdefault('destinationPorts', "any-dest-port")
    raw.setdefault('sourceZones', "any-src-zn")
    raw.setdefault('destinationZones', "any-dst-zn")
    interesting_keys = ('name', 'action', 'sourceZones', 'sourceNetworks', 'sourcePorts', 'destinationZones', 'destinationNetworks', 'destinationPorts')
    subdict = {x: raw.get(x, "any") for x in interesting_keys if x in raw}
    if 'objects' in subdict['sourceZones']:
        srczn = subdict['sourceZones']['objects'][0]['name']
    elif 'literals' in subdict['sourceZones']:
        srczn = subdict['sourceZones']['literals'][0]['port']
    else:
        srczn = subdict['sourceZones']
    if 'objects' in subdict['sourceNetworks']:
        srcnet = subdict['sourceNetworks']['objects'][0]['name']
    elif 'literals' in subdict['sourceNetworks']:
        srcnet = subdict['sourceNetworks']['literals'][0]['value']
    else:
        srcnet = subdict['sourceNetworks']
    if 'objects' in subdict['sourcePorts']:
        srcprt = subdict['sourcePorts']['objects'][0]['name']
    elif 'literals' in subdict['sourcePorts']:
        srcprt = subdict['sourcePorts']['literals'][0]['port']
    else:
        srcprt = subdict['sourcePorts']
    if 'objects' in subdict['destinationZones']:
        dstzn = subdict['destinationZones']['objects'][0]['name']
    elif 'literals' in subdict['destinationZones']:
        dstzn = subdict['destinationZones']['literals'][0]['port']
    else:
        dstzn = subdict['destinationZones']
    if 'objects' in subdict['destinationNetworks']:
        dstnet = subdict['destinationNetworks']['objects'][0]['name']
    elif 'literals' in subdict['destinationNetworks']:
        dstnet = subdict['destinationNetworks']['literals'][0]['value']
    else:
        dstnet = subdict['destinationNetworks']
    if 'objects' in subdict['destinationPorts']:
        dstprt = subdict['destinationPorts']['objects'][0]['name']
    elif 'literals' in subdict['destinationPorts']:
        try:
            dstprt = subdict['destinationPorts']['literals'][0]['port']
        except KeyError:
            dstprt = "0"
    else:
        dstprt = subdict['destinationPorts']
    #print >> target, "%d,%s,%s,%s,%s,%s,%s,%s,%s" % (number, subdict['name'], subdict['action'], srczn, srcnet, srcprt, dstzn, dstnet, dstprt)
    #print "%d,%s,%s,%s,%s,%s,%s,%s,%s" % (number, subdict['name'], subdict['action'], srczn, srcnet, srcprt, dstzn, dstnet, dstprt)
    number += 1
    time.sleep(.5)
    print raw
Can you suggest other ways that would improve error handling, performance, readability, etc?
Whenever you see repeated code, perhaps it can be collapsed into one function that is reused over and over. Refactoring the code this way probably doesn't make it faster, but it does make it easier to read and understand.
def getSubdictData(subdict, fieldname, prop2):
    if 'objects' in subdict[fieldname]:
        return subdict[fieldname]['objects'][0]['name']
    elif 'literals' in subdict[fieldname]:
        if prop2 in subdict[fieldname]['literals'][0]:
            return subdict[fieldname]['literals'][0][prop2]
        else:
            return "0"
    else:
        return subdict[fieldname]
for i in results:
    ...
    srczn = getSubdictData(subdict, 'sourceZones', 'port')
    srcnet = getSubdictData(subdict, 'sourceNetworks', 'value')
    srcprt = getSubdictData(subdict, 'sourcePorts', 'port')
    dstzn = getSubdictData(subdict, 'destinationZones', 'port')
    dstnet = getSubdictData(subdict, 'destinationNetworks', 'value')
    dstprt = getSubdictData(subdict, 'destinationPorts', 'port')
    ...
Also note that I removed the try-except you were using to detect whether a field existed. If the object provides a way to determine that a field exists, we should use it instead of relying on a sloppy try-except to perform our logical operations. try-excepts used like this are hard to read and perform badly. In this case it wouldn't matter, but it's good practice not to abuse them like this.
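As a further simplification, the membership test in the 'literals' branch is exactly what dict.get with a default expresses:

# equivalent to the inner if/else above: falls back to "0" when prop2 is missing
return subdict[fieldname]['literals'][0].get(prop2, "0")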

How to go about storing users data in Json

I have been messing with JSON a bit today, doing some trial and error, but I couldn't seem to make this work:
def checkForNewBooty(chan):
    j = urllib2.urlopen('http://tmi.twitch.tv/group/user/' + chan + '/chatters')
    j_obj = json.load(j)
    viewers = j_obj['chatters']['viewers']
    moderators = j_obj['chatters']['moderators']
    for x in viewers and moderators:
        print(json.dumps('users' = {'johhny' = {'Points' = 0, 'Time Joined' = 9938}}))
JSON example of what I'm trying to do:
{
    users = {
        "johhnyknoxville"
    }
}
What is the proper way of doing this?
Python dictionaries (which are serialized to and from JSON) use : and not = between keys and values.
Try:
json.dumps({'users': {'johhny': {'Points': 0, 'Time Joined': 9938}}})
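For reference, that call should return the string below (dicts preserve insertion order in Python 3.7+, so the key order is stable):

'{"users": {"johhny": {"Points": 0, "Time Joined": 9938}}}'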
