Match words before square bracket and extract the value - python

Hi, I have some log data:
IPADDRESS['192.168.0.10'] - - PlATFORM['windows'] - - BROWSER['chrome'] - - EPOCH['1402049518'] - - HEXA['0x3c0593ee']
I want to extract the corresponding value inside the brackets using a regular expression,
for example (this does not work)
ipaddress = re.findall(r"\[(IPADDRESS[^]]*)",output)
Expected output:
ipaddress = 192.168.0.10

You can simply get all the elements as a dictionary like this
print dict(re.findall(r'([a-zA-Z]+)\[\'(.*?)\'\]', data))
Output
{'BROWSER': 'chrome',
'EPOCH': '1402049518',
'HEXA': '0x3c0593ee',
'IPADDRESS': '192.168.0.10',
'PlATFORM': 'windows'}

s1 = "IPADDRESS['192.168.0.10'] - - PlATFORM['windows'] - - BROWSER['chrome'] - - EPOCH['1402049518'] - - HEXA['0x3c0593ee']"
import re
print re.findall('\[\'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\'\]',s1)[0]
192.168.0.10
To get all in a list:
s1 = "IPADDRESS['192.168.0.10'] - - PlATFORM['windows'] - - BROWSER['chrome'] - - EPOCH['1402049518'] - - HEXA['0x3c0593ee']"
print re.findall('\[\'(.*?)\'\]',s1)
['192.168.0.10', 'windows', 'chrome', '1402049518', '0x3c0593ee']
result=re.findall('\[\'(.*?)\'\]',s1)
ipaddress, platform, browser, epoch, hexa = result # assign all variables
print "ipaddress = {}, platform = {}, browser = {}, epoch = {}, hexa = {}".format(ipaddress,platform,browser,epoch,hexa)
ipaddress = 192.168.0.10, platform = windows, browser = chrome, epoch = 1402049518, hexa = 0x3c0593ee

Related

Python Json value overwritten by last value

lista =
[{Identity: joe,
summary:[
{distance: 1, time:2, status: idle},
{distance:2, time:5, status: moving}],
{unit: imperial}]
I can pull the data easily and put in pandas. The issue is, if an identity has multiple instances of, say idle, it takes the last value, instead of summing together.
my code...
zdrivershours = {}
zdistance = {}
zstophours = {}
For driver in resp:
driverid[driver['AssetID']] = driver['AssetName']
for value in [driver['SegmentSummary']]:
for value in value:
if value['SegmentType'] == 'Motion':
zdriverhours[driver['AssetID']] = round(value['Time']/3600,2)
if value['SegmentType'] == 'Stop':
zstophours[driver['AssetID']] = round(value['IdleTime']/3600,2)
zdistance[driver['AssetID']] = value['Distance']
To obtain the sum of the distances for every driver, replace:
zdistance[driver['AssetID']] = value['Distance']
by
if driver['AssetID'] in zdistance:
zdistance[driver['AssetID']] = zdistance[driver['AssetID']] + value['Distance']
else:
zdistance[driver['AssetID']] = value['Distance']

Traceroute - Sometimes the sockets fail

I'm writing traceroute in Python3 as an exercise. It's working quite well, but every now and then, the trace will be unable to receive an answer from a certain hop and keep incrementing the TTL without ever getting a response. The regular linux traceroute program does not suffer from this problem so it has to be something in my code.
I've tried different variations, but they all suffer from the same problem. I'll run the trace on 8.8.8.8 and 4 out of 5 times, it works perfectly.
I've tried creating sockets using the context manager (with socket as ...) and also manually.
import os
import random
import socket
import sys
import time
class Traceroute:
    """UDP-probe traceroute that stores one result dict per TTL in ``self.hops``.

    For each TTL a UDP datagram is sent towards the destination and a raw
    ICMP socket waits for the matching ICMP error.  Each entry of
    ``self.hops`` is a dict with the keys ``'address'``, ``'hostname'`` and
    ``'latency'``; an unanswered hop has ``address``/``latency`` set to
    ``None`` and ``hostname`` set to the TTL.
    """

    # Seconds to wait for the ICMP answer to one probe.
    PROBE_TIMEOUT = 2
    # Probes sent for one TTL before the hop is recorded as unanswered.
    PROBES_PER_HOP = 3

    # NOTE: the default for ``port`` is evaluated once, when the class is
    # defined, so every instance created without an explicit port in the
    # same process shares that one randomly chosen value.
    def __init__(self, dest, maxhops=30, port=random.randrange(33434,33534), host=''):
        self.dest = dest
        self.maxhops = maxhops
        self.port = port
        self.host = host
        self.hops = []

    def get_socket(self, type, proto=0):
        """Create an AF_INET socket of the given type/proto bound to (host, port).

        The socket is closed again if ``bind`` fails, so a failed call does
        not leak a descriptor; the ``OSError`` is re-raised to the caller.
        """
        s = socket.socket(
            family=socket.AF_INET,
            type=type,
            proto=proto
        )
        try:
            s.bind((self.host, self.port))
        except OSError:
            s.close()
            raise
        return s

    def _dest_port(self, ttl):
        """Return the UDP destination port used for probes sent with ``ttl``.

        A different port per TTL lets a reply be matched to the hop it
        belongs to.  The result wraps around so it stays within 1..65535.
        """
        return 1 + (self.port + ttl - 1) % 65535

    @staticmethod
    def _quoted_udp_dest_port(packet):
        """Return the destination port of the UDP datagram quoted in an ICMP error.

        ``packet`` is expected to start with the outer IPv4 header, followed
        by the ICMP message (assumes the raw socket delivers the IP header,
        as Linux does for IPv4 raw sockets -- TODO confirm on other
        platforms).  Returns ``None`` when the packet is too short, is not
        an ICMP "destination unreachable" (3) or "time exceeded" (11)
        message, or does not quote a UDP datagram.
        """
        if len(packet) < 20 or packet[0] >> 4 != 4:
            return None
        outer_len = (packet[0] & 0x0F) * 4
        icmp = packet[outer_len:]
        # 8 bytes of ICMP header plus at least a minimal quoted IPv4 header.
        if len(icmp) < 8 + 20 or icmp[0] not in (3, 11):
            return None
        quoted = icmp[8:]
        quoted_len = (quoted[0] & 0x0F) * 4
        # Byte 9 of an IPv4 header is the protocol number; 17 is UDP.
        if quoted_len < 20 or quoted[9] != 17:
            return None
        udp_header = quoted[quoted_len:quoted_len + 8]
        if len(udp_header) < 4:
            return None
        return int.from_bytes(udp_header[2:4], 'big')

    def _await_reply(self, receiver, dest_port, deadline):
        """Wait until ``deadline`` for the ICMP error answering our probe.

        ICMP datagrams that do not quote a UDP datagram addressed to
        ``dest_port`` (late answers to earlier probes, unrelated traffic)
        are skipped instead of being attributed to the current hop.

        Returns ``(address, received_at)`` or ``None`` on timeout.
        """
        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return None
            receiver.settimeout(remaining)
            try:
                data, address = receiver.recvfrom(1024)
            except socket.timeout:
                return None
            if self._quoted_udp_dest_port(data) == dest_port:
                return address[0], time.monotonic()

    def _probe_hop(self, ttl, icmp_proto, udp_proto):
        """Probe a single TTL and return the hop dict to store for it.

        Both sockets stay open for all probes of this TTL and are closed
        exactly once when the hop is done.  Every retry sends a fresh probe.
        """
        dest_port = self._dest_port(ttl)
        with self.get_socket(socket.SOCK_RAW, icmp_proto) as receiver, \
                self.get_socket(socket.SOCK_DGRAM, udp_proto) as sender:
            sender.setsockopt(socket.SOL_IP, socket.IP_TTL, ttl)
            for _ in range(self.PROBES_PER_HOP):
                sender.sendto(b'', (self.dest_ip, dest_port))
                sent_at = time.monotonic()
                reply = self._await_reply(
                    receiver, dest_port, sent_at + self.PROBE_TIMEOUT)
                if reply is None:
                    continue
                address, received_at = reply
                try:
                    hostname = socket.gethostbyaddr(address)[0]
                except (socket.herror, socket.gaierror):
                    # No reverse DNS entry: show the bare address instead.
                    hostname = address
                return {
                    'address': address,
                    'hostname': hostname,
                    'latency': [round((received_at - sent_at) * 1000, 2)]
                }
        return {
            'address': None,
            'hostname': ttl,
            'latency': None
        }

    def get_path(self):
        """Run the trace, appending one hop per TTL and redrawing after each.

        Stops when a reply comes from the destination address or after
        ``maxhops`` TTLs.  If the destination cannot be resolved, the error
        is printed and no trace is attempted.
        """
        try:
            self.dest_ip = socket.gethostbyname(self.dest)
        except socket.gaierror as e:
            print(f'Error: {self.dest} - {e}')
            return
        try:
            self.dest_name = socket.gethostbyaddr(self.dest)[0]
        except (socket.herror, socket.gaierror):
            # The destination has no reverse DNS entry; fall back to the IP.
            self.dest_name = self.dest_ip

        icmp_proto = socket.getprotobyname('icmp')
        udp_proto = socket.getprotobyname('udp')
        for ttl in range(1, self.maxhops + 1):
            hop = self._probe_hop(ttl, icmp_proto, udp_proto)
            self.hops.append(hop)
            os.system('clear')
            self.draw()
            if hop['address'] == self.dest_ip:
                break

    def draw(self):
        """Print the destination, the raw hop dicts, then a formatted hop table."""
        name = f'{self.dest_ip} - {self.dest_name}' if self.dest_name != self.dest_ip else self.dest_ip
        print(f'Traceroute to {name}')
        # Print the full hop list, for debugging
        for hop in self.hops:
            print(hop)
        # Print the hops in a readable fashion.  The position comes from
        # enumerate: list.index() would return the first *equal* dict and
        # repeat a number when two hops compare equal.
        for number, hop in enumerate(self.hops, start=1):
            if hop['address'] is None:
                print(f"{number} - {'*':<50} - {'*':<50} - {'*':<50}")
            else:
                latencylen = len(hop['latency'])
                latencyavg = str(round(sum(hop['latency'])/latencylen,2)) + 'ms'
                name = f'{hop["address"]} - {hop["hostname"]}' if hop['address'] != hop['hostname'] else hop['address']
                print(f'{number} - {name:<50} - {latencyavg:<50} - LEN:{latencylen:<50}')
def main():
    """Trace the route to the destination given as the first command-line argument.

    Prints a short hint instead when no destination was supplied.
    """
    if len(sys.argv) >= 2:
        Traceroute(sys.argv[1]).get_path()
    else:
        print('Please enter a destination')


if __name__ == '__main__':
    main()
Here is the output when it completely bugs out:
self.hops is a list of dictionaries.
The stuff in {brackets} is the hop dictionary itself. I added it for debugging.
Note that hops 2 and 3 are identical. So the trace somehow ran against this hop twice. The number at the beginning of the line is the list index, which also represents the TTL. I usually reach 8.8.8.8 by the 9th hop, but in this case it just keeps going, incrementing the TTL until it reaches the maximum.
Traceroute to 8.8.8.8 - google-public-dns-a.google.com
{'address': '192.168.1.1', 'hostname': '_gateway', 'latency': [8.21]}
{'address': '104.195.128.122', 'hostname': '104-195-128-122.cpe.teksavvy.com', 'latency': [1572.38]}
{'address': '104.195.128.122', 'hostname': '104-195-128-122.cpe.teksavvy.com', 'latency': [15.97]}
{'address': '104.195.129.9', 'hostname': '104-195-129-9.cpe.teksavvy.com', 'latency': [27.22]}
{'address': '206.248.155.92', 'hostname': 'ae10-0-bdr01-tor2.teksavvy.com', 'latency': [20.05]}
{'address': '206.248.155.10', 'hostname': 'ae12-0-bdr01-tor.teksavvy.com', 'latency': [27.2]}
{'address': None, 'hostname': 7, 'latency': None}
{'address': None, 'hostname': 8, 'latency': None}
{'address': None, 'hostname': 9, 'latency': None}
{'address': None, 'hostname': 10, 'latency': None}
{'address': None, 'hostname': 11, 'latency': None}
{'address': None, 'hostname': 12, 'latency': None}
{'address': None, 'hostname': 13, 'latency': None}
{'address': None, 'hostname': 14, 'latency': None}
{'address': None, 'hostname': 15, 'latency': None}
1 - 192.168.1.1 - _gateway - 8.21ms - LEN:1
2 - 104.195.128.122 - 104-195-128-122.cpe.teksavvy.com - 1572.38ms - LEN:1
3 - 104.195.128.122 - 104-195-128-122.cpe.teksavvy.com - 15.97ms - LEN:1
4 - 104.195.129.9 - 104-195-129-9.cpe.teksavvy.com - 27.22ms - LEN:1
5 - 206.248.155.92 - ae10-0-bdr01-tor2.teksavvy.com - 20.05ms - LEN:1
6 - 206.248.155.10 - ae12-0-bdr01-tor.teksavvy.com - 27.2ms - LEN:1
7 - * - * - *
8 - * - * - *
9 - * - * - *
10 - * - * - *
11 - * - * - *
12 - * - * - *
13 - * - * - *
14 - * - * - *
15 - * - * - *
Thanks,

How to access and replace values in a dict-list?

[python]
I have a multi-stage dict-list. I want to access a location in the dict-list and replace it with the list data and data_result. I am able to access the location and append the values to a variable. However, I don't know how to create a variable that stores the entire dict-list together with the replaced/appended list.
Multi-stage structure, focusing on data and results list
Note: all parameters are set to a value.
data = {
"uuid": str(uuid),
"name": str(tag + "_" + testName + "_" + str(datetime.datetime.utcnow())),
"verdict": str(totalResult)
}
print(data)
data_result = {
"Test_Case": str(Test_Case),
"Test_Result": str(result),
"Time Evaluation": str(duration)
}
res = tcm_template ## ** HERE **
res2 = tcm_template['testcatalogmanager']['ut']['tests']
for res3 in res2:
res4 = res3['data']
for res5 in res4: # # res4 in data list
final_result = res5['results']
final_result.append(data_result)
#tcm_template.replace(data) ## # data - prototype code
#tcm_template.append(final_result) ## # results - prototype code

Converting multi-line script output to dictionary using regex

I got the following script output:
***************************************************
[g4u2680c]: searching for domains
---------------------------------------------------
host = g4u2680c.houston.example.com
ipaddr = [16.208.16.72]
VLAN = [352]
Gateway= [16.208.16.1]
Subnet = [255.255.248.0]
Subnet = [255.255.248.0]
Cluster= [g4u2679c g4u2680c g9u1484c g9u1485c]
host = g4u2680c.houston.example.com
ipaddr = [16.208.16.72]
VLAN = [352]
Gateway= [16.208.16.1]
Subnet = [255.255.248.0]
Subnet = [255.255.248.0]
Cluster= [g4u2679c g4u2680c g9u1484c g9u1485c]
* script completed Mon Jun 15 06:13:14 UTC 2015 **
* sleeping 30 to avoid DOS on dns via a loop **
I need to extract the two host lists into a dictionary, without the brackets.
Here is my code:
#!/bin/env python
import re
text="""***************************************************
[g4u2680c]: searching for domains
---------------------------------------------------
host = g4u2680c.houston.example.com
ipaddr = [16.208.16.72]
VLAN = [352]
Gateway= [16.208.16.1]
Subnet = [255.255.248.0]
Subnet = [255.255.248.0]
Cluster= [g4u2679c g4u2680c g9u1484c g9u1485c]
host = g4u2680c.houston.example.com
ipaddr = [16.208.16.72]
VLAN = [352]
Gateway= [16.208.16.1]
Subnet = [255.255.248.0]
Subnet = [255.255.248.0]
Cluster= [g4u2679c g4u2680c g9u1484c g9u1485c]
* script completed Mon Jun 15 06:13:14 UTC 2015 **
* sleeping 30 to avoid DOS on dns via a loop **
***************************************************
"""
seq = re.compile(r"host.+?\n\n",re.DOTALL)
a=seq.findall(text)
matches = re.findall(r'\w.+=.+', a[0])
matches = [m.split('=', 1) for m in matches]
matches = [ [m[0].strip().lower(), m[1].strip().lower()] for m in matches]
#should have function with regular expression to remove bracket here
d = dict(matches)
print d
What I got so far for the first host:
{'subnet': '[255.255.248.0]', 'vlan': '[352]', 'ipaddr': '[16.208.16.72]', 'cluster': '[g4u2679c g4u2680c g9u1484c g9u1485c]', 'host': 'g4u2680c.houston.example.com', 'gateway': '[16.208.16.1]'}
I need help finding a regex to remove the brackets, as the values in the dictionary contain data both with and without brackets.
Or if there is a better and simpler way to transform the original script output into dictionary.
You can use: (\w+)\s*=\s*\[?([^\n\]]+)\]?
demo
import re
p = re.compile(ur'(\w+)\s*=\s*\[?([^\n\]]+)\]?', re.MULTILINE)
test_str = u"host = g4u2680c.houston.example.com\n ipaddr = [16.208.16.72]\n VLAN = [352]\n Gateway= [16.208.16.1]\n Subnet = [255.255.248.0]\n Subnet = [255.255.248.0]\n Cluster= [g4u2679c g4u2680c g9u1484c g9u1485c]\n\nhost = g4u2680c.houston.example.com\n ipaddr = [16.208.16.72]\n VLAN = [352]\n Gateway= [16.208.16.1]\n Subnet = [255.255.248.0]\n Subnet = [255.255.248.0]\n Cluster= [g4u2679c g4u2680c g9u1484c g9u1485c]\n"
re.findall(p, test_str)
You can simply use re.findall and dict :
>>> dict([(i,j.strip('[]')) for i,j in re.findall(r'(\w+)\s*=\s*(.+)',text)])
{'Subnet': '255.255.248.0', 'VLAN': '352', 'ipaddr': '16.208.16.72', 'Cluster': 'g4u2679c g4u2680c g9u1484c g9u1485c', 'host': 'g4u2680c.houston.example.com', 'Gateway': '16.208.16.1'}
And about the brackets you can remove them by str.strip method.
You can try out this.
matches = [m.replace('[','').replace(']','').split('=', 1) for m in matches]

Python: load text as python object [duplicate]

This question already has answers here:
How to convert raw javascript object to a dictionary?
(6 answers)
Closed 9 months ago.
I have a such text to load: https://sites.google.com/site/iminside1/paste
I'd prefer to create a python dictionary from it, but any object is OK. I tried pickle, json and eval, but didn't succeed. Can you help me with this?
Thanks!
The results:
a = open("the_file", "r").read()
json.loads(a)
ValueError: Expecting property name: line 1 column 1 (char 1)
pickle.loads(a)
KeyError: '{'
eval(a)
File "<string>", line 19
from: {code: 'DME', airport: "Домодедово", city: 'Москва', country: 'Россия', terminal: ''},
^
SyntaxError: invalid syntax
Lifted almost straight from the pyparsing examples page:
# read text from web page
import urllib
page = urllib.urlopen("https://sites.google.com/site/iminside1/paste")
html = page.read()
page.close()
start = html.index("<pre>")+len("<pre>")+3 #skip over 3-byte header
end = html.index("</pre>")
text = html[start:end]
print text
# parse dict-like syntax
from pyparsing import (Suppress, Regex, quotedString, Word, alphas,
alphanums, oneOf, Forward, Optional, dictOf, delimitedList, Group, removeQuotes)
LBRACK,RBRACK,LBRACE,RBRACE,COLON,COMMA = map(Suppress,"[]{}:,")
integer = Regex(r"[+-]?\d+").setParseAction(lambda t:int(t[0]))
real = Regex(r"[+-]?\d+\.\d*").setParseAction(lambda t:float(t[0]))
string_ = Word(alphas,alphanums+"_") | quotedString.setParseAction(removeQuotes)
bool_ = oneOf("true false").setParseAction(lambda t: t[0]=="true")
item = Forward()
key = string_
dict_ = LBRACE - Optional(dictOf(key+COLON, item+Optional(COMMA))) + RBRACE
list_ = LBRACK - Optional(delimitedList(item)) + RBRACK
item << (real | integer | string_ | bool_ | Group(list_ | dict_ ))
result = item.parseString(text,parseAll=True)[0]
print result.data[0].dump()
print result.data[0].segments[0].dump(indent=" ")
print result.data[0].segments[0].flights[0].dump(indent=" - ")
print result.data[0].segments[0].flights[0].flightLegs[0].dump(indent=" - - ")
for seg in result.data[6].segments:
for flt in seg.flights:
fltleg = flt.flightLegs[0]
print "%(airline)s %(airlineCode)s %(flightNo)s" % fltleg,
print "%s -> %s" % (fltleg["from"].code, fltleg["to"].code)
Prints:
[['index', 0], ['serviceClass', '??????'], ['prices', [3504, ...
- eTicketing: true
- index: 0
- prices: [3504, 114.15000000000001, 89.769999999999996]
- segments: [[['indexSegment', 0], ['stopsCount', 0], ['flights', ...
- serviceClass: ??????
[['indexSegment', 0], ['stopsCount', 0], ['flights', [[['index', 0], ...
- flights: [[['index', 0], ['time', 'PT2H45M'], ['minAvailSeats', 9], ...
- indexSegment: 0
- stopsCount: 0
- [['index', 0], ['time', 'PT2H45M'], ['minAvailSeats', 9], ['flight...
- - flightLegs: [[['flightNo', '309'], ['eTicketing', 'true'], ['air...
- - index: 0
- - minAvailSeats: 9
- - stops: []
- - time: PT2H45M
- - [['flightNo', '309'], ['eTicketing', 'true'], ['airplane', 'Boe...
- - - airline: ?????????
- - - airlineCode: UN
- - - airplane: Boeing 737-500
- - - availSeats: 9
- - - classCode: I
- - - eTicketing: true
- - - fareBasis: IPROW
- - - flightClass: ECONOMY
- - - flightNo: 309
- - - from: - - [['code', 'DME'], ['airport', '??????????'], ...
- - - airport: ??????????
- - - city: ??????
- - - code: DME
- - - country: ??????
- - - terminal:
- - - fromDate: 2010-10-15
- - - fromTime: 10:40:00
- - - time:
- - - to: - - [['code', 'TXL'], ['airport', 'Berlin-Tegel'], ...
- - - airport: Berlin-Tegel
- - - city: ??????
- - - code: TXL
- - - country: ????????
- - - terminal:
- - - toDate: 2010-10-15
- - - toTime: 11:25:00
airBaltic BT 425 SVO -> RIX
airBaltic BT 425 SVO -> RIX
airBaltic BT 423 SVO -> RIX
airBaltic BT 423 SVO -> RIX
EDIT: fixed grouping and expanded output dump to show how to access individual key fields of results, either by index (within list) or as attribute (within dict).
If you really have to load the bulls... this data is (see my comment), you're probably best off with a regex adding the missing quotes. Something like r"([a-zA-Z_][a-zA-Z_0-9]*)\s*\:" to find things to quote and r"\'\1\'\:" as replacement (off the top of my head, I have to test it first).
Edit: After some trouble with backward-references in Python 3.1, I finally got it working with these:
>>> pattern = r"([a-zA-Z_][a-zA-Z_0-9]*)\s*\:"
>>> test = '{"foo": {bar: 1}}'
>>> repl = lambda match: '"{}":'.format(match.group(1))
>>> eval(re.sub(pattern, repl, test))
{'foo': {'bar': 1}}
Till now with help of delnan and a little investigation I can load it into dict with eval:
pattern = r"\b(?P<word>\w+):"
x = re.sub(pattern, '"\g<word>":',open("the_file", "r").read())
y = x.replace("true", '"true"')
d = eval(y)
Still looking for more efficient and maybe simpler solution.. I don't like to use "eval" for some reasons.
Extension of the DominiCane's version:
import re
# Matches a bare object key: a word directly followed by ':' and preceded by
# '{', whitespace or ','.  Group 2 is the key that needs quoting.
quote_keys_regex = re.compile(r'([\{\s,])(\w+)(:)')


def js_variable_to_python(js_variable):
    """Convert a JavaScript literal into JSON and return the loaded value.

    The text is rewritten so that ``json.loads`` accepts it:

    * single-quoted strings become double-quoted strings,
    * a bare ``"`` inside a single-quoted string is escaped,
    * ``\\'`` (not a valid JSON escape) is reduced to a plain ``'``,
    * unquoted object keys get double quotes.

    ``chr(1)`` is used as a temporary stand-in for ``:`` inside strings, so
    input that already contains that character is not handled correctly.

    Raises ``ValueError`` (from ``json.loads``) if the rewritten text is
    still not valid JSON.
    """
    # Imported locally: the snippet's import block only brings in ``re``.
    import json

    # With a single capture group, re.split alternates plain text (even
    # indices) and the quote characters it split on (odd indices), e.g.
    # r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = re.split(r'(["\'])', js_variable)

    # Quote character that opened the current JS string, or None outside one.
    in_string = None
    for i, part in enumerate(parts):
        is_quote = i % 2 == 1

        if in_string is None:
            if is_quote:
                # Start of a string: JSON only knows double quotes.
                parts[i] = '"'
                in_string = part
            continue

        if not is_quote:
            # Text inside a string: hide colons so quote_keys_regex cannot
            # mistake string content for an object key.
            parts[i] = part.replace(':', chr(1))
            continue

        # A quote character inside a string.  It is escaped only when it is
        # preceded by an odd number of backslashes; an even number means the
        # backslashes escape each other.
        preceding = parts[i - 1]
        backslashes = len(preceding) - len(preceding.rstrip('\\'))
        escaped = backslashes % 2 == 1

        if part == in_string and not escaped:
            # Unescaped matching quote: the string ends here.
            parts[i] = '"'
            in_string = None
        elif part == "'":
            # A literal single quote needs no escape in JSON, and "\'" is
            # not a legal JSON escape, so drop the backslash if present.
            if escaped:
                parts[i - 1] = preceding[:-1]
        elif not escaped:
            # Bare double quote inside a single-quoted string: escape it.
            # (An already escaped one is left untouched.)
            parts[i] = r'\"'

    s = ''.join(parts)
    # add quotes around the keys: { a: 12 } becomes { "a": 12 }
    s = quote_keys_regex.sub(r'\1"\2"\3', s)
    # put the colons hidden inside strings back
    s = s.replace(chr(1), ':')
    return json.loads(s)
It deals only with int, null and string. I don't know about float.
Note the use of chr(1): the code doesn't work if this character appears in js_variable.

Categories

Resources