I need to convert logical data dictionary to a physical (abbreviated) data dictionary - I have given 4 use cases below.
I need help with this pseudo-code / requirement:
# Lookup table: logical business term -> physical abbreviation.
refDict = {}
# Collects any words that had no dictionary entry, so they can be reported.
noMatchFound = {}

# The real dictionary would be read from a comma-delimited file:
# with open('dictionary.csv') as inputDict:
#     for line in inputDict:
#         busTerm, busAbbr = line.split(',')
#         refDict[busTerm] = busAbbr.replace("\n", "")

# Sample data dictionary entries.
refDict = {
    'user': 'USR',
    'call': 'CALL',
    'detail': 'DTL',
    'record': 'REC',
    'call detail record': 'CDR',
    'count': 'CNT',
}

# Use case 1: the multi-word phrase wins over its individual words.
input_string1 = "user call detail record"
# output should be "USR_CDR"
# noMatchFound - will be empty - since all are matched and replaced

# Use case 2: an unknown word is kept (upper-cased) and reported.
input_string2 = "user test call detail record"
# output should be "USR_TEST_CDR"
# noMatchFound - should have an entry "TEST" with a reference to "user test call detail record"

# Use case 3: the phrase is interrupted, so only single words match.
input_string3 = "user call count detail record"
# output should be "USR_CALL_CNT_DTL_REC"
# noMatchFound - will be empty - since all are matched and replaced

# Use case 4: the phrase followed by another known word.
input_string4 = "user call detail record count"
# output should be "USR_CDR_CNT"
# noMatchFound - will be empty - since all are matched and replaced
So far, I could figure out the code snippet for matching any possible one-single-largest-expression as:
import re
# using regular expressions, find the longest matching expression
def getLongestSequenceSize(inputStr, inDict):
    """Find the dictionary term with the longest match in ``inputStr``.

    Matching is case-insensitive (both sides are upper-cased) and a term
    only matches on word boundaries. Consecutive repetitions of the same
    term count as a single, longer match.

    Args:
        inputStr: the text to search.
        inDict: mapping of term -> abbreviation.

    Returns:
        A two-item list ``[matched_text, abbreviation]``. The matched text
        is upper-cased and stripped. Both items are empty strings when no
        term matches.
    """
    haystack = inputStr.strip().upper()
    ret_match = ""
    ret_match_len = 0
    ret_abbr = ""
    for inKey, abbr in inDict.items():
        # re.escape keeps terms containing regex metacharacters ('.', '+',
        # '(' ...) literal instead of letting them change the pattern.
        pattern = r'(?:\b%s\b\s?)+' % re.escape(inKey.strip().upper())
        matches = re.findall(pattern, haystack)
        if not matches:
            continue
        # Compare by length: a plain max() would pick the lexicographically
        # greatest match, which is not always the longest one.
        longest_match = max(matches, key=len)
        if ret_match_len < len(longest_match):
            ret_match_len = len(longest_match)
            ret_match = longest_match.strip()
            ret_abbr = abbr
    return [ret_match.strip(), ret_abbr.strip()]
The idea is you start trying to replace() from the biggest string in the dictionary and you check every possible replacement given the dictionary, from longer to shorter.
This works exactly as you expected:
refDict = {
    'user': 'USR',
    'call': 'CALL',
    'detail': 'DTL',
    'record': 'REC',
    'call detail record': 'CDR',
    'count': 'CNT',
}

# Longest terms first, so a multi-word phrase is replaced before the single
# words it contains.
sorted_ref = sorted(refDict.items(), key=lambda x: len(x[0]), reverse=True)


def do_work(input_string):
    """Convert a logical name into its abbreviated physical name.

    Every dictionary term found in ``input_string`` is replaced by its
    abbreviation, longest terms first. Words that are still lower-case
    afterwards had no dictionary entry: they are upper-cased and reported.

    Args:
        input_string: whitespace-separated logical name, in lower case.

    Returns:
        A tuple ``(physical_name, noMatchFound)``. ``physical_name`` is the
        result joined with underscores. ``noMatchFound`` maps each
        unmatched word to ``input_string``.
    """
    import re  # local import keeps this snippet self-contained

    rval = input_string
    for key, value in sorted_ref:
        # Match whole words only: a plain str.replace would also rewrite
        # the "user" inside "username".
        pattern = r'(?<!\w)%s(?!\w)' % re.escape(key)
        # A callable replacement inserts the abbreviation verbatim, with no
        # backslash or group-reference processing.
        rval = re.sub(pattern, lambda _match, abbr=value: abbr, rval)

    noMatchFound = {}
    tokens = []
    for token in rval.split():
        if token.islower():
            noMatchFound[token] = input_string
            # Upper-case this token only, so words that merely contain it
            # are left alone.
            token = token.upper()
        tokens.append(token)
    return '_'.join(tokens), noMatchFound
# The four use cases from the question.
inputs = [
    "user call detail record",
    "user test call detail record",
    "user call count detail record",
    "user call detail record count",
]

# Show each input, its abbreviated form and any unmatched words.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
for inp in inputs:
    print(inp)
    output, noMatchFound = do_work(inp)
    print(output)
    print(noMatchFound)
    print('---')
Output:
user call detail record
USR_CDR
{}
---
user test call detail record
USR_TEST_CDR
{'test': 'user test call detail record'}
---
user call count detail record
USR_CALL_CNT_DTL_REC
{}
---
user call detail record count
USR_CDR_CNT
{}
Related
i have a dictionary with entries that have the ip and ports displayed like this
{'source': '192.168.4.1:80', 'destination': '168.20.10.1:443'}
but i want it to display it like
{'src_ip': '192.168.4.1', 'src_port': 80, 'dest_ip': '168.20.10.1', 'dest_port': 443}
so i want to split the first two entries into 4 new ones and delete the two old ones.
my code currently looks like this:
log_entry = {'source': '192.168.4.1:80', 'destination': '168.20.10.1:443'}


def split_ip_port(log_entry):
    """Split the 'source' and 'destination' values into ip and port keys.

    For each of the two keys whose value contains ':', the value is split
    on ':' and stored as 'src_ip'/'src_port' or 'dest_ip'/'dest_port', and
    the original key is deleted. The port is stored as a string.

    Args:
        log_entry: dict holding both a 'source' and a 'destination' key.
            It is modified in place.

    Returns:
        The same ``log_entry`` dict.

    Raises:
        KeyError: if 'source' or 'destination' is missing. Both keys are
            read unconditionally, so a dict holding only one of them fails
            on the other.
    """
    source = log_entry['source']
    if ':' in source:
        parts = source.split(':')
        log_entry['src_ip'] = parts[0]
        log_entry['src_port'] = parts[1]
        del log_entry['source']

    destination = log_entry['destination']
    if ':' in destination:
        parts = destination.split(':')
        # Debug output of the split destination.
        print(parts)
        log_entry['dest_ip'] = parts[0]
        log_entry['dest_port'] = parts[1]
        del log_entry['destination']

    return log_entry
When I try to test the source, it gives me KeyError: 'destination', and when I try to test the destination, it gives me KeyError: 'source'. What is happening here?
When you split value (e.g., log_entry['source'].split(":") ) it returns list ['192.168.4.1','80']. Then you have to return value by index from list, [0] index in list is '192.168.4.1'. Then you have to assign it to new key in your dict, log_entry['src_ip']
# Split each "ip:port" value once, before touching the dict, so a missing
# key or malformed value cannot leave log_entry half-converted.
src_parts = log_entry['source'].split(":")
dest_parts = log_entry['destination'].split(":")

# Index 0 is the address and index 1 the port. The port is converted to an
# int to match the desired output.
log_entry['src_ip'] = src_parts[0]
log_entry['src_port'] = int(src_parts[1])
log_entry['dest_ip'] = dest_parts[0]
log_entry['dest_port'] = int(dest_parts[1])

# The combined entries are no longer needed.
del log_entry['source']
del log_entry['destination']
Since the original code works, here is just a suggestion to simplify it: you could split each source/destination value into its address and port, and then build a new dictionary like this:
orig_dc = {'source': '192.168.4.1:80', 'destination': '168.20.10.1:443'}

# Prefix used for the new keys built from each original key.
KEY_PREFIXES = {'source': 'src', 'destination': 'dest'}

new_dc = {}
for k, v in orig_dc.items():
    # Exact key lookup: a test such as `k in 'source'` is a substring check
    # and would also accept keys like 's' or ''.
    prefix = KEY_PREFIXES.get(k)
    if prefix is None:
        # Not an endpoint entry; nothing to split.
        continue
    orig, port = v.split(':')
    new_dc[prefix + '_ip'] = orig
    new_dc[prefix + '_port'] = int(port)

expected = {'src_ip': '192.168.4.1', 'src_port': 80,
            'dest_ip': '168.20.10.1', 'dest_port': 443}
assert new_dc == expected
def message_probability(user_message, recognised_words, single_response=False, required_words=None):
    """Score how well a tokenised message matches one predefined response.

    Args:
        user_message: the words of the user's message.
        recognised_words: words that count towards this response.
        single_response: when true, the score is returned even if some
            required words are missing.
        required_words: words that must all be present in ``user_message``
            unless ``single_response`` is set. Defaults to no required words.

    Returns:
        An integer percentage: the number of message words found in
        ``recognised_words`` divided by ``len(recognised_words)``, times
        100 and truncated. Returns 0 when a required word is missing and
        ``single_response`` is false, or when ``recognised_words`` is empty.
    """
    if required_words is None:
        required_words = []

    # Nothing can be recognised, and dividing by the length would fail.
    if not recognised_words:
        return 0

    # Count how many words of the message are recognised.
    message_certainty = sum(1 for word in user_message if word in recognised_words)

    # Fraction of recognised words, relative to the size of the word list.
    percentage = float(message_certainty) / float(len(recognised_words))

    # Every required word has to appear in the message.
    has_required_words = all(word in user_message for word in required_words)

    # Must either have the required words, or be a single response.
    if has_required_words or single_response:
        return int(percentage * 100)
    return 0
def check_all_messages(message):
    """Return the best-scoring bot response for a tokenised message.

    Each candidate response is scored with ``message_probability`` and the
    highest-scoring one is returned. When the best score is below 1, the
    result of ``long.unknown()`` is returned instead.

    Args:
        message: the words of the user's message.

    Returns:
        The chosen response text.
    """
    highest_prob_list = {}

    # Simplifies response creation / adds it to the dict.
    def response(bot_response, list_of_words, single_response=False, required_words=None):
        # The dict is only mutated, never rebound, so no `nonlocal` is needed.
        highest_prob_list[bot_response] = message_probability(
            message, list_of_words, single_response, required_words or [])

    # Responses -------------------------------------------------------------------------------------------------------
    response('Hello i am from Mobilis communication Agency .how can i helo you?', ['hello', 'hi', 'hey', 'sup', 'heyo'], single_response=True)
    response('مرحبا كيف يمكنني مساعدتك', ['سلام', 'اهلا', 'مرحبا'], single_response=True)

    # Longer responses
    response(long.R_ADVICE, ['give', 'advice'], required_words=['advice'])

    best_match = max(highest_prob_list, key=highest_prob_list.get)
    # print(highest_prob_list)
    # print(f'Best match = {best_match} | Score: {highest_prob_list[best_match]}')

    return long.unknown() if highest_prob_list[best_match] < 1 else best_match
# Used to get the response
def get_response(user_input):
    """Return the bot's reply to one line of raw user input.

    The input is lower-cased and split on whitespace and on the
    punctuation characters ``, ; ? ! . -`` before being matched.
    """
    lowered = user_input.lower()
    split_message = re.split(r'\s+|[,;?!.-]\s*', lowered)
    return check_all_messages(split_message)
# Testing the response system
# Chat loop: read a line from the user and print the bot's reply, forever.
while True:
    user_text = input('You: ')
    print('Bot: ' + get_response(user_text))
lista =
[{Identity: joe,
summary:[
{distance: 1, time:2, status: idle},
{distance:2, time:5, status: moving}],
{unit: imperial}]
I can pull the data easily and put in pandas. The issue is, if an identity has multiple instances of, say idle, it takes the last value, instead of summing together.
my code...
# Per-driver results, keyed by AssetID.
zdriverhours = {}
zdistance = {}
zstophours = {}

for driver in resp:
    # NOTE(review): driverid is not defined in this snippet; presumably it is
    # a dict created earlier - confirm.
    driverid[driver['AssetID']] = driver['AssetName']
    for value in driver['SegmentSummary']:
        # NOTE: each assignment below overwrites the previous one, so only the
        # last matching segment of a driver is kept rather than a running total.
        if value['SegmentType'] == 'Motion':
            zdriverhours[driver['AssetID']] = round(value['Time']/3600, 2)
        if value['SegmentType'] == 'Stop':
            zstophours[driver['AssetID']] = round(value['IdleTime']/3600, 2)
        # NOTE(review): assumed to run for every segment, not only 'Stop'
        # segments - confirm.
        zdistance[driver['AssetID']] = value['Distance']
To obtain the sum of the distances for every driver, replace:
zdistance[driver['AssetID']] = value['Distance']
by
# Add this segment's distance to the driver's running total. The first
# segment seen for a driver starts the total.
asset_id = driver['AssetID']
if asset_id not in zdistance:
    zdistance[asset_id] = value['Distance']
else:
    zdistance[asset_id] = zdistance[asset_id] + value['Distance']
Say I have a file with the following:
/* Full name: abc */
.....
.....(.....)
.....(".....) ;
/* .....
/* .....
..... : "....."
}
"....., .....
Car : true ;
House : true ;
....
....
Age : 33
....
/* Full name: xyz */
....
....
Car : true ;
....
....
Age : 56
....
I am only interested in the full name, car, house and age of each person. There are many other lines of data, in different formats, between the variables/attributes that I am interested in.
My code so far:
import re
# Values assumed for a person until a matching line says otherwise.
initial_val = {'House': 'false', 'Car': 'false'}

with open('input.txt') as f:
    records = []
    current_record = None
    for line in f:
        # Blank lines carry no data.
        if not line.strip():
            continue

        if current_record is None:
            # Nothing is collected until the first "Full name" line is seen.
            name_match = re.search('.+Full name ?: (.+) ', line)
            if name_match:
                current_record = dict(initial_val, Name=name_match.group(1))
            continue

        house_match = re.search(' *(House) ?: ?([a-z]+)', line)
        if house_match:
            current_record['House'] = house_match.group(2)

        car_match = re.search(' *(Car) ?: ?([a-z]+)', line)
        if car_match:
            current_record['Car'] = car_match.group(2)

        # A new "Full name" line closes the record in progress.
        name_match = re.search('.+Full name ?: (.+) ', line)
        if name_match:
            records.append(current_record)
            current_record = dict(initial_val, Name=name_match.group(1))

# NOTE: a record is only appended when the next "Full name" line is found, so
# the record still in progress at the end of the file is not in `records`.
print(records)
What I get:
[{'Name': 'abc', 'House': 'true', 'Car': 'true'}]
My question:
How am I supposed to extract the data and store it in a dictionary like:
{'abc': {'Car': true, 'House': true, 'Age': 33}, 'xyz':{'Car': true, 'House': false, 'Age': 56}}
My purpose:
check whether each person has car, house and age, if no then return false
Then I could print them in a table like this:
Name Car House Age
abc true true 33
xyz true false 56
Note that I am using Python 2.7 and I do not know what is the actual value of each variable/attribute (Eg. abc, true, true, 33) of each person.
What is the best solution to my question? Thanks.
Well, you just have to keep track of the current record:
def parse_name(line):
    """Extract the person's name from a ``/* Full name: abc */`` line.

    Args:
        line: the header line, with or without its trailing newline.

    Returns:
        The text after the first ':' with the comment markers and the
        surrounding whitespace removed, e.g. ``'abc'``.
    """
    # Drop the newline first; otherwise it stops the right-hand side of the
    # '/* ' strip and the closing ' */' stays in the result.
    stripped_line = line.strip().strip('/* ')
    # Split on the first ':' only, so a name containing ':' is kept whole.
    return stripped_line.split(':', 1)[-1].strip()
# Fields that are copied from the file into each record.
WANTED_KEYS = ('Car', 'Age', 'House')
# Default values for when the lines are not present for a record.
INITIAL_VAL = {'Car': False, 'House': False, 'Age': -1}
records = []
current_record = None

with open('the_filename') as f:
    for line in f:
        if not line.strip():
            # skip empty lines
            continue

        # Only a "Full name" comment starts a person; the file contains
        # other '/*' comment lines that must not open a new record.
        if line.startswith('/*') and 'Full name' in line:
            if current_record is not None:
                # the current record finished, and a new one is starting
                records.append(current_record)
            current_record = dict(INITIAL_VAL, name=parse_name(line))
            continue

        if current_record is None:
            # data before the first name line belongs to nobody
            continue

        # partition never raises, unlike unpacking line.split(':'), which
        # fails on lines with no ':' or with more than one.
        key, separator, val = line.partition(':')
        key = key.strip()
        if separator and key in WANTED_KEYS:
            # we want to keep track of this field; drop the trailing ';'
            current_record[key] = val.strip().rstrip(';').rstrip()
        # otherwise just ignore the line

# The last record has no following name line to close it.
if current_record is not None:
    records.append(current_record)

print('Name\tCar\tHouse\tAge')
for record in records:
    print(record['name'], record['Car'], record['House'], record['Age'], sep='\t')
Note that for Age you may want to convert it to an integer using int:
# The raw key still has the whitespace around it ("Age "), so compare the
# stripped form. The value may end in ';', which int() cannot parse.
if key.strip() == 'Age':
    current_record['Age'] = int(val.strip().rstrip(';'))
The above code produces a list of dictionaries, but it is easy enough to convert it to a dictionary of dicts:
# Re-key the list of records by person name. Each value is a copy of the
# record without its 'name' entry, so `records` itself is left untouched.
new_records = {}
for r in records:
    val = dict(r)
    new_records[r['name']] = val
for val in new_records.values():
    del val['name']
After this new_records will be something like:
{'abc': {'Car': True, 'House': True, Age: 20}, ...}
If you have other lines with a different format in between the interesting ones you can simply write a function that returns True or False depending on whether the line is in the format you require and use it to filter the lines of the file:
def is_interesting_line(line):
    """Tell whether ``line`` is in a format worth parsing.

    Returns:
        True for lines that start with '/*' or contain ':', False otherwise.
    """
    # Always return a real bool rather than falling through to None.
    return line.startswith('/*') or ':' in line
# Only lines accepted by is_interesting_line are passed to the loop body.
for line in filter(is_interesting_line, f):
    # code as before
Change is_interesting_line to suit your needs. In the end, if you have to handle several different formats etc. maybe using a regex would be better, in that case you could do something like:
import re
# Accepts a '/* ... */' comment, a 'word : value' line, or any further
# alternative added in place of the '<other stuff>' placeholder.
LINE_REGEX = re.compile(r'(/\*.*\*/)|(\w+\s*:.*)| <other stuff>')


def is_interesting_line(line):
    """Return True when LINE_REGEX matches at the start of ``line``."""
    found = LINE_REGEX.match(line)
    if found is None:
        return False
    return True
If you want you can obtain fancier formatting for the table, but you probably first need to determine the maximum length of the name etc. or you can use something like tabulate to do that for you.
For example something like (not tested):
# Width of the name column: the longest name, but never narrower than the
# 'Name' heading. Appending the 4 keeps max() valid when there are no records.
max_name_length = max([len(r['name']) for r in records] + [4])

# Name and Car are left-aligned to a width passed as the following argument.
format_string = '{:<{}}\t{:<{}}\t{}\t{}'

print(format_string.format('Name', max_name_length, 'Car', 5, 'House', 'Age'))
for record in records:
    # str() keeps a default boolean readable: formatting a bool with a width
    # spec would render it as 0 or 1.
    print(format_string.format(record['name'], max_name_length,
                               str(record['Car']), 5,
                               record['House'], record['Age']))
Hi I'm in the process of learning so you may have to bear with me. I have 2 lists I'd like to compare whilst keeping any matches and append them whilst appending any non matches to another output list.
Heres my code:
def EntryToFieldMatch(Entry, Fields):
    """Print the entries that mention a field, then the rejected fields.

    An entry is appended to the valid list once for every field name it
    contains. A field that does not occur in the current entry is moved to
    the invalid list only when its 1-based position in ``Fields`` equals
    ``len(Entry)``.

    NOTE: ``Fields`` is modified in place while it is being iterated, so
    the field following a removed one is skipped for that pass, and fields
    at other positions are never reported as invalid.

    Args:
        Entry: list of entry strings.
        Fields: list of field names; rejected names are removed from it.
    """
    valid = []
    invalid = []
    for entry in Entry:
        for position, field in enumerate(Fields, 1):
            if field in entry:
                valid.append(entry)
            elif position == len(Entry):
                invalid.append(field)
                Fields.remove(field)
    print(valid)
    print("-" * 50)
    print(invalid)
def main():
    """Run EntryToFieldMatch on the sample entries and field names."""
    vEntry = ['27/04/2014', 'Hours = 28', 'Site = Abroad', '03/05/2015', 'Date = 28-04-2015', 'Travel = 2']
    Fields = ['Week_Stop', 'Date', 'Site', 'Hours', 'Travel', 'Week_Start', 'Letters']
    EntryToFieldMatch(vEntry, Fields)


# Comparison needs '=='; a single '=' here is a syntax error.
if __name__ == "__main__":
    main()
the output seems fine except its not returning all the fields in the 2 output lists. This is the output I receive:
['Hours = 28', 'Site = Abroad', 'Date = 28-04-2015', 'Travel = 2']
--------------------------------------------------
['Week_Start', 'Letters']
I just have no idea why the second list doesn't include "Week_Stop". I've run the debugger and followed the code through a few times to no avail. I've read about sets but I didn't see any way to return fields that match and discard fields that don't.
Also im open to suggestion's if anybody knows of a way to simplify this whole process, I'm not asking for free code, just a nod in the right direction.
Python 2.7, Thanks
You only have two conditions, either it is in the string or the count is equal to the length of Entry, neither of which catch the first element 'Week_Stop', the length goes from 7-6-5 catching Week_Start but never gets to 0 so you never reach Week_Stop.
A more efficient way would be to use sets or a collections.OrderedDict if you want to keep order:
from collections import OrderedDict
def EntryToFieldMatch(Entry, Fields):
    """Print the entries that start with a field name, then the unused fields.

    An entry is valid when its first whitespace-delimited word is a field
    name that has not been matched yet. Fields keep their original order.

    Args:
        Entry: list of entry strings such as ``'Hours = 28'``.
        Fields: list of field names; it is not modified.
    """
    valid = []
    # create an OrderedDict from the words in Fields
    # dict lookups are O(1)
    st = OrderedDict.fromkeys(Fields)
    # iterate over Entry
    for word in Entry:
        # split the words once on whitespace
        spl = word.split(None, 1)
        # a blank entry has no first word at all, so it cannot match;
        # otherwise check whether the first word is one of our dict keys
        if spl and spl[0] in st:
            # add to valid list
            valid.append(word)
            # remove the key
            del st[spl[0]]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(valid)
    print("-" * 50)
    # only unmatched fields will be left; list() gives a plain list on both
    # Python 2 and Python 3
    print(list(st))
Output:
['Hours = 28', 'Site = Abroad', 'Date = 28-04-2015', 'Travel = 2']
--------------------------------------------------
['Week_Stop', 'Week_Start', 'Letters']
For large lists this would be significantly faster than your quadratic approach. Having O(1) dict lookups means your code goes from quadratic to linear; every time you do `in Fields` on a list, that is an O(n) operation.
Using a set the approach is similar:
def EntryToFieldMatch(Entry, Fields):
    """Print the entries that start with a field name, then the unused fields.

    Same approach as the ordered version, but the fields are held in a
    set, so the order of the unused fields is not preserved.

    Args:
        Entry: list of entry strings such as ``'Hours = 28'``.
        Fields: list of field names; it is not modified.
    """
    valid = []
    st = set(Fields)
    for word in Entry:
        # The field name is the first whitespace-delimited word of the entry.
        spl = word.split(None, 1)
        # A blank entry has no first word at all, so it cannot match.
        if spl and spl[0] in st:
            valid.append(word)
            # Each field can be matched only once.
            st.remove(spl[0])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(valid)
    print("-" * 50)
    # Whatever is left was never matched.
    print(st)
The difference using sets is order is not maintained.
Using list comprehension:
def EntryToFieldMatch(Entries, Fields):
    """Print the valid entries, the invalid entries and the missed fields.

    An entry is valid when it contains at least one field name as a
    substring. A field is missed when no entry contains it.

    Args:
        Entries: list of entry strings.
        Fields: list of field names.
    """
    def mentions_field(entry):
        # True when any field name occurs somewhere in the entry.
        return any(field in entry for field in Fields)

    valid = [entry for entry in Entries if mentions_field(entry)]
    invalidEntries = [entry for entry in Entries if not mentions_field(entry)]
    missedFields = [field for field in Fields
                    if not any(field in entry for entry in Entries)]

    # Each line is formatted into one string, so the output is identical
    # under the Python 2 print statement and the Python 3 print function.
    print('valid entries: %s' % (valid,))
    print('-' * 80)
    print('invalid entries: %s' % (invalidEntries,))
    print('-' * 80)
    print('missed fields: %s' % (missedFields,))
# Sample data: free-form entries and the field names they may mention.
vEntry = [
    '27/04/2014',
    'Hours = 28',
    'Site = Abroad',
    '03/05/2015',
    'Date = 28-04-2015',
    'Travel = 2',
]
Fields = [
    'Week_Stop',
    'Date',
    'Site',
    'Hours',
    'Travel',
    'Week_Start',
    'Letters',
]

EntryToFieldMatch(vEntry, Fields)
valid entries: ['Hours = 28', 'Site = Abroad', 'Date = 28-04-2015', 'Travel = 2']
--------------------------------------------------------------------------------
invalid entries: ['27/04/2014', '03/05/2015']
--------------------------------------------------------------------------------
missed fields: ['Week_Stop', 'Week_Start', 'Letters']