Python tnsnames.ora parser - python

I need a dict containing all database connections from tnsnames.ora file.
I need to go from this :
(DESCRIPTION=(ADDRESS_LIST=(ADDRESS=(PROTOCOL=TCP)(HOST=mydbserver.mydomain.com)(PORT=1521)))(CONNECT_DATA=(SID=CATAL)(SERVER=DEDICATED)(SERVICE_NAME=mydb.mydomain.com)))
to this :
{'DESCRIPTION': [{'ADDRESS_LIST': [{'ADDRESS': [{'PROTOCOL': 'TCP'},
{'HOST': 'mydbserver.mydomain.com'},
{'PORT': '1521'}
]
}]
},
{'CONNECT_DATA': [{'SID': 'CATAL'},
{'SERVER': 'DEDICATED'},
{'SERVICE_NAME': 'mydb.mydomain.com'}
]
}
]
}
So far, my code is :
def get_param(param_string):
print("get_param input:", param_string)
if param_string.count("(") != param_string.count(")"):
raise Exception("Number of '(' is not egal to number of ')' : " + str(param_string.count("(")) + " and " + str(param_string.count(")")))
else:
param_string = param_string[1:-1]
splitted = param_string.split("=")
keywork = splitted[0]
if len(splitted) == 2:
return {keywork: splitted[1]}
else:
splitted.remove(keywork)
values = "=".join(splitted)
return {keywork: get_value_list(values)}
def get_value_list(value_string):
print("get_value_list input:", value_string)
to_return = list()
if "=" not in value_string and "(" not in value_string and ")" not in value_string:
to_return.append(value_string)
elif value_string[0] != "(":
raise Exception("[ERROR] Format error '(' is not the first char: " + repr(value_string))
else:
parenth_count = 0
strlen = len(value_string)
current_value = ""
for i in range(0,strlen):
current_char = value_string[i]
current_value += current_char
if current_char == "(":
parenth_count += 1
elif current_char == ")":
parenth_count += -1
if parenth_count == 0:
to_return.append(get_param(current_value))
if i != (strlen - 1):
if value_string[i+1] == "(":
to_return += get_value_list(value_string[i+1:])
else:
raise Exception("Format error - Next char should be a '('. value_string[i+1]:" + repr(value_string[i+1]) )
break
print("get_value_list return:", to_return)
if len(to_return) == 0:
to_return = ""
elif len(to_return) == 1:
to_return = to_return[0]
return to_return
connection_infos = "(DESCRIPTION=(ADDRESS_LIST=(ADDRESS=(PROTOCOL=TCP)(HOST=mydbserver.mydomain.com)(PORT=1521)))(CONNECT_DATA=(SID=CATAL)(SERVER=DEDICATED)(SERVICE_NAME=mydb.mydomain.com)))"
current_connection = get_param(connection_infos)
print("current_connection:", current_connection)
pprint(current_connection)
And I got this :
{'DESCRIPTION': [{'ADDRESS_LIST': {'ADDRESS': [{'PROTOCOL': 'TCP'},
{'HOST': 'mydbserver.mydomain.com'},
'PORT']
}
},
'CONNECT_DATA'
]
}
So I'm doing something wrong. And I feel I'm doing something too complicated. Would anyone point some mistake I made or help me find a simpler way to do this ?

I have a working code now, but I'm not really satisfied with it. It's too long, not flexible, and will not work with some other tnsnames.ora possible formats :
class Tnsnames():
def __init__(self, file_path, file_name='tnsnames.ora'):
self.file_path = file_path
self.file_name = file_name
self.load_file()
def load_file(self):
try:
fhd = open(os.path.join(self.file_path, self.file_name), 'rt', encoding='utf-8')
except:
raise
else:
#Oracle doc : https://docs.oracle.com/cd/B28359_01/network.111/b28317/tnsnames.htm#NETRF007
file_content = list()
for l in fhd:
l = " ".join(l.split()).strip(" \n")
if len(l) > 0:
if "#" not in l:
file_content.append(l)
fhd.close()
file_content = " ".join(file_content)
connections_list = dict()
current_depth = 0
current_word = ""
current_keyword = ""
name_to_register = ""
is_in_add_list = False
current_addr = dict()
connections_aliases = dict()
stop_registering = False
connections_duplicates = list()
for c in file_content:
if c == " ":
pass
elif c == "=":
current_keyword = str(current_word)
current_word = ""
if current_keyword == "ADDRESS_LIST":
is_in_add_list = True
elif c == "(":
if current_depth == 0:
current_keyword = current_keyword.upper()
names_list = current_keyword.replace(" ","").split(",")
if len(names_list) == 1:
name_to_register = names_list[0]
else:
name_to_register = None
# We use either the first name with at least
# a dot in it, or the longest one.
for n in names_list:
if "." in n:
name_to_register = n
break
else:
name_to_register = max(names_list, key=len)
names_list.remove(name_to_register)
for n in names_list:
if n in connections_aliases.keys():
print("[ERROR] already registered alias:", n,
". Registered to:", connections_aliases[n],
". New:", name_to_register,
". This possible duplicate will not be registered.")
connections_duplicates.append(n)
stop_registering = True
else:
connections_aliases[n] = name_to_register
if not stop_registering:
connections_list[name_to_register] = {"ADDRESS_LIST": list(),
"CONNECT_DATA": dict(),
"LAST_TEST_TS": None}
current_depth += 1
elif current_depth in [1,2,3]:
current_depth += 1
else:
print("[ERROR] Incorrect depth:", repr(current_depth), ". Current connection will not be registered" )
del connections_list[name_to_register]
stop_registering = True
elif c == ")":
if current_depth == 1:
if stop_registering:
stop_registering = False
else:
# Before moving to next connection,
# we check that current connection
# have at least a HOST, and a SID or
# SERVICE_NAME
connection_is_valid = True
if isinstance(connections_list[name_to_register]["ADDRESS_LIST"], dict):
if "HOST" not in connections_list[name_to_register]["ADDRESS_LIST"].keys():
print("[ERROR] Only one address defined, and no HOST defined. Current connection will not be registered:", name_to_register)
connection_is_valid = False
elif isinstance(connections_list[name_to_register]["ADDRESS_LIST"], list):
for current_address in connections_list[name_to_register]["ADDRESS_LIST"]:
if "HOST" in current_address.keys():
break
else:
print("[ERROR] Multiple addresses but none with HOST. Current connection will not be registered:", name_to_register)
connection_is_valid = False
else:
print("[ERROR] Incorrect address format:", connections_list[name_to_register]["ADDRESS_LIST"], " Connection:", name_to_register)
connection_is_valid = False
if not connection_is_valid:
del connections_list[name_to_register]
else:
if "SERVICE_NAME" not in connections_list[name_to_register]["CONNECT_DATA"].keys() and \
"SID" not in connections_list[name_to_register]["CONNECT_DATA"].keys():
print("[ERROR] Missing SERVICE_NAME / SID for connection:", name_to_register)
del connections_list[name_to_register]
elif current_depth == 2:
if is_in_add_list:
is_in_add_list = False
if not stop_registering:
if len(connections_list[name_to_register]["ADDRESS_LIST"]) == 1:
connections_list[name_to_register]["ADDRESS_LIST"] = connections_list[name_to_register]["ADDRESS_LIST"][0]
elif current_depth == 3:
if is_in_add_list:
if not stop_registering:
connections_list[name_to_register]["ADDRESS_LIST"].append(current_addr)
current_addr = dict()
elif current_keyword.upper() in ["SID", "SERVER", "SERVICE_NAME"]:
if not stop_registering:
connections_list[name_to_register]["CONNECT_DATA"][current_keyword.upper()] = current_word.upper()
elif current_depth == 4:
if is_in_add_list:
if not stop_registering:
current_addr[current_keyword.upper()] = current_word.upper()
current_keyword = ""
current_word = ""
current_depth += -1
else:
current_word += c
self.connections = connections_list
self.aliases = connections_aliases
self.duplicates = connections_duplicates
Test tnsnames.ora :
########################################
# This is a sample tnsnames.ora #
########################################
###################################################
# PRODDB
###################################################
proddb.mydbs.domain.com, PRODDB =
(DESCRIPTION =
(ADDRESS_LIST =
(ADDRESS = (PROTOCOL = TCP)(HOST = proddb1.mydbs.domain.com)(PORT = 1522))
(ADDRESS = (PROTOCOL = TCP)(HOST = proddb2.mydbs.domain.com)(PORT = 1522))
(ADDRESS = (PROTOCOL = TCP)(HOST = proddb3.mydbs.domain.com)(PORT = 1522))
(ADDRESS = (PROTOCOL = TCP)(HOST = proddb4.mydbs.domain.com)(PORT = 1522))
)
(CONNECT_DATA =
(SID = PRODDB)
(SERVER = DEDICATED)
(SERVICE_NAME = proddb.mydbs.domain.com)
)
)
###################################################
# DEVDBA : Test database for DBA usage
###################################################
devdba.mydbs.domain.com, DEVDBA =
(DESCRIPTION =
(ADDRESS_LIST =
(ADDRESS = (PROTOCOL = TCP)(HOST = devdba.mydbs.domain.com)(PORT = 1521))
)
(CONNECT_DATA =
(SID = DEVDBA)
)
)
Test code :
from pprint import pprint
from lib_database import Tnsnames
tnsnnames = Tnsnames('/usr/lib/oracle/12.2/client64/network/admin')
print('Connexions:')
pprint(tnsnnames.connections)
print('Aliases:')
pprint(tnsnnames.aliases)
print('Duplicates:')
pprint(tnsnnames.duplicates)
Output :
Connexions:
{'DEVDBA.MYDBS.DOMAIN.COM': {'ADDRESS_LIST': {'HOST': 'DEVDBA.MYDBS.DOMAIN.COM',
'PORT': '1521',
'PROTOCOL': 'TCP'},
'CONNECT_DATA': {'SID': 'DEVDBA'},
'PRODDB.MYDBS.DOMAIN.COM': {'ADDRESS_LIST': [{'HOST': 'PRODDB1.MYDBS.DOMAIN.COM',
'PORT': '1522',
'PROTOCOL': 'TCP'},
{'HOST': 'PRODDB2.MYDBS.DOMAIN.COM',
'PORT': '1522',
'PROTOCOL': 'TCP'},
{'HOST': 'PRODDB3.MYDBS.DOMAIN.COM',
'PORT': '1522',
'PROTOCOL': 'TCP'},
{'HOST': 'PRODDB4.MYDBS.DOMAIN.COM',
'PORT': '1522',
'PROTOCOL': 'TCP'}],
'CONNECT_DATA': {'SERVER': 'DEDICATED',
'SERVICE_NAME': 'PRODDB.MYDBS.DOMAIN.COM',
'SID': 'PRODDB'}}
Aliases:
{'DEVDBA': 'DEVDBA.MYDBS.DOMAIN.COM', 'PRODDB': 'PRODDB.MYDBS.DOMAIN.COM'}
Duplicates:
[]
I could not find other Python parser for tnsnames.ora files. If you know about one, please point me to it.

You can do this with pyparsing:
import pyparsing as pp
# 1. Literals
VAR = pp.Word(pp.alphas + "_", pp.alphanums + "_").setName('variable')
SPACE = pp.Suppress(pp.Optional(pp.White()))
EQUALS = SPACE + pp.Suppress('=') + SPACE
OPEN = pp.Suppress('(') + SPACE
CLOSE = pp.Suppress(')') + SPACE
INTEGER = pp.Optional('-') + pp.Word(pp.nums) + ~pp.Char(".")
INTEGER.setParseAction(lambda t: int(t[0]))
FLOAT = pp.Optional('-') + pp.Word(pp.nums) + pp.Char('.') + pp.Optional(pp.Word(pp.nums))
FLOAT.setParseAction(lambda t: float(t[0]))
STRING = pp.Word(pp.alphanums + r'_.-')
# 2. Literal assignment expressions: (IDENTIFIER = VALUE)
INTEGER_ASSIGNMENT = pp.Group(OPEN + VAR + EQUALS + INTEGER + CLOSE)
FLOAT_ASSIGNMENT = pp.Group(OPEN + VAR + EQUALS + FLOAT + CLOSE)
STRING_ASSIGNMENT = pp.Group(OPEN + VAR + EQUALS + STRING + CLOSE)
# 3. Nested object assignment
ASSIGNMENT = pp.Forward()
NESTED_ASSIGNMENT = pp.Group(OPEN + VAR + EQUALS + ASSIGNMENT + CLOSE)
ASSIGNMENT << pp.OneOrMore(INTEGER_ASSIGNMENT |
FLOAT_ASSIGNMENT |
STRING_ASSIGNMENT |
NESTED_ASSIGNMENT)
# 4. Net service name(s): NAME(.DOMAIN)[, NAME(.DOMAIN)...]
NET_SERVICE_NAME = pp.OneOrMore(pp.Word(pp.alphas + '_' + '.', pp.alphanums + '_' + '.')
+ pp.Optional(pp.Suppress(',')))
# 5. Full TNS entry
TNS_ENTRY = NET_SERVICE_NAME + EQUALS + ASSIGNMENT
Here is some example data:
TNS_NAMES_ORA = """
MYDB =
(DESCRIPTION =
(ADDRESS_LIST =
(ADDRESS = (PROTOCOL = TCP)(HOST = server01)(PORT = 25881))
)
(CONNECT_DATA =
(SID = MYDB01)
)
)
OTHERDB.DOMAIN, ALIAS_FOR_OTHERDB.DOMAIN =
(DESCRIPTION_LIST =
(DESCRIPTION =
(ADDRESS_LIST = (ADDRESS = (PROTOCOL = TCP)
(HOST = server02)
(PORT = 25881)
))
(CONNECT_DATA = (SID = MYDB02))
)
)
"""
This part is a little one-off-ish, but here's an example of putting some additional parsing on top of that in order to extract all the data:
def _parse_addresses(tns_entry, addresses):
"""
Parse ADDRESS keywords from the a TNS entry
:param definition: Unparsed part of the TNS entry
:param addresses: List of addresses parsed
"""
keyword = tns_entry[0]
# Base Case: We found an ADDRESS, so extract the data
# and do not recurse into it
if keyword.upper() == 'ADDRESS':
port = None
host = None
for k, v in tns_entry[1:]:
if k == 'PORT':
port = v
elif k == 'HOST':
host = v
if port is None:
print('WARNING: Ignoring ADDRESS due to missing PORT')
elif host is None:
print('WARNING: Ignoring ADDRESS due to missing HOST')
addresses.append({'host': host, 'port': port})
# Else recursively descend through the definition
for d in tns_entry[1:]:
# Only parse sub-lists, not literals
if isinstance(d, list):
_parse_addresses(d, addresses)
def _parse_connect_data(tns_entry, sids):
"""
Parse CONNECT_DATA keywords from the a TNS entry
:param definition: Unparsed part of the TNS entry
:param sids: List of Oracle SIDs
"""
keyword = tns_entry[0]
# Base Case: We found a CONNECT_DATA, so extract the data
# and do not recurse into it
if keyword.upper() == 'CONNECT_DATA':
sid = None
for k, v in tns_entry[1:]:
if k == 'SID':
sid = v
if sid is None:
print('WARNING: Ignoring CONNECT_DATA due to missing SID')
sids.append(sid)
for d in tns_entry[1:]:
# Only parse sub-lists, not literals
if isinstance(d, list):
_parse_connect_data(d, sids)
def get_connection_info(net_service_name: str, tns_string: str):
"""
Generator over all simple connections inferred from a TNS entry
:param net_service_name: Net service name to return connection info for
:param tns_string: tnsnames.ora file contents
"""
# Parse the TNS entries and keep the requested definition
definition = None
for tokens, _start, _end in TNS_ENTRY.scanString(tns_string):
if net_service_name in tokens.asList()[0]:
definition = tokens.asList()[1]
break
# Check if we found a definition
if definition is None:
raise KeyError(f'No net service named {net_service_name}')
# Look for all the ADDRESS keywords
addresses = []
_parse_addresses(definition, addresses)
# Look for all CONNECT_DATA keywords
sids = []
_parse_connect_data(definition, sids)
# Emit all combinations
for address in addresses:
for sid in sids:
yield {'sid': sid, **address}
# Try it out!
for connection_info in get_connection_info('MYDB', TNS_NAMES_ORA):
print(connection_info)
I wrote a blog post about it here for "fun":
https://unparameterized.blogspot.com/2021/02/parsing-oracle-tns-files-in-python.html

import re
def find_match(tns_regex, y):
x1 = re.match(tns_regex, y, re.M + re.I + re.S)
if x1 is not None:
x1 = x1.groups(1)[0] # Only first match is returned
x1 = x1.strip('\n')
return(x1)
# Removing commented text
with open("C:\\Oracle\\product\\11.2.0\\client_1\\network\\admin\\tnsnames.ora") as tns_file:
with open("test_tns.ora", 'w+') as output:
lines =tns_file.readlines()
for line in lines:
if not line.startswith('#'):
output.write(line)
with open('test_tns.ora') as tns_file:
tnsnames = tns_file.read()
tnsnames1 = re.split(r"\){3,}\n\n", tnsnames)
# Regex matches
tns_name = '^(.+?)\s?\=\s+\(DESCRIPTION.*'
tns_host = '.*?HOST\s?=\s?(.+?)\)'
tns_port = '.*?PORT\s?=\s?(\d+?)\)'
tns_sname = '.*?SERVICE_NAME\s?=\s?(.+?)\)'
tns_sid = '.*?SID\s?=\s?(.+?)\)'
easy_connects = []
for y in tnsnames1:
y = '%s))' % y
l = [find_match(x, y) for x in [tns_name, tns_host, tns_port, tns_sname, tns_sid]]
d = {
'name': l[0],
'host': l[1],
'port': l[2],
'service_name': l[3],
'sid': l[4]
}
easy_connects.append(d)
print(easy_connects)

I have written this small code. It parses tnsnames.ora. It is fast and works great.
def parse_ora_tns_file(fpath,tnskey=None,return_all_keys=False,view_file=False,logger=None)->str:
"""
This function parse oracle tns file
parms: fpath : full file path like
fpath=filepath\tnsnames.ora
param: tnskey: find tns entry for given tns key like tnskey='ABC.WORLD'
param: return_all_keys: if True it will return all tns key names
param: view_file : if True it returns tnsnames.ora as str
"""
clean_tns_file=''
if logger:
logger.info('Reading tnsnames ora file at {} ...'.format(fpath))
with open(fpath,mode='r') as tns_file:
lines =tns_file.readlines()
for line in lines:
if not line.startswith('#'):
clean_tns_file=clean_tns_file+line
#clean file
clean_str = clean_tns_file.replace('\n','')
clean_str = clean_str.replace('\t','')
#replace = with ' ' so later I can split with ' '
#with below it becomes like ABC.WORLD = (DESCRIPTION) to ABC.WORLD ' ' (DESCRIPTION)
clean_str = clean_str.replace('=',' ')
#Below one output to ['ABC.WORLD','',' (DESCRIPTION)']
lstresult= clean_str.split(" ")
#Below code remove extra space from char list like it becomes ['ABC.WORLD','','(DESCRIPTION)']
lstresult = [txt.strip() for txt in lstresult]
#Below code to replace () chars to '' from list so output as ['ABC.WORLD','','DESCRIPTION']
removetable = str.maketrans('', '', '()')
out_list = [s.translate(removetable) for s in lstresult]
#Below code remove any empty list items so output as ['ABC.WORLD','DESCRIPTION']
out_list = list(filter(None, out_list))
#find index of DESCRIPTION words
indices = [i for i, x in enumerate(out_list ) if x.upper() == "DESCRIPTION"]
tns_keys= ""
for i in indices:
#use index of DESCRIPTION to tns keys which is required for regx pattern below.
tns_keys=tns_keys+out_list[i-1]+"|"
if return_all_keys:
return tns_keys.replace('|',',')[:-1]
if logger:
logger.info('Keys found in tnsnames ora: {}'.format(tns_keys))
regex = r"\s+(?!^({}))".format(tns_keys)
result = re.sub(regex, '', clean_tns_file, 0, re.MULTILINE)
if view_file:
return result
if result:
for match in re.finditer(r'^((?:{}))(.*)'.format(tns_keys), result, re.MULTILINE):
if match.group(1) == tnskey:
#removing = sign from start of entry
if logger:
logger.info('Found tns entry: {} {}'.format(match.group(1),match.group(2)))
return match.group(2)[1:]
if logger:
logger.info('No tns entry found for {}'.format(tnskey))
return None

Related

Selenium Webscraper Unbound local error in set [ ]

I inherited a bit of Python code and I have no background. I am getting unbound local error and is probably something really silly on my part.
UnboundLocalError Traceback (most recent call last)
Input In [1], in <cell line: 372>()
368 print('\n----------------------------------------------------------------------\n')
369 ##############################################################################
--> 372 main()
Input In [1], in main()
319 Gender = p.getGender()
320 StateRes = p.getStateRes()
--> 321 children = immunte(Fname, Lname, DOB, Gender, driver)
323 if children == []:
324 not_found += 1
Input In [1], in immunte(Fname, Lname, DOB, Gender, driver)
204 except WebDriverException:
205 al = []
--> 207 return al
UnboundLocalError: local variable 'al' referenced before assignment
I have looked at this for a few days now and I can't seem to find an answer to this problem even though it is likely simple. It seems a solution to this error is a global keyword somewhere but I am not sure if this applies here as every time I tried to apply global to al = [] i got an error or same result. Any help is appreciated, thank you.
# Imports
import csv
import datetime
import os
import os.path
import time
import pandas as pd
from dateutil.parser import parse
from pandas import DataFrame
from selenium import webdriver
from selenium.common.exceptions import (NoSuchElementException,
WebDriverException)
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
##############################################################################
# Classes
class Person(object):
def __init__(self, measYr, MEMID, MIDSK, fname, lname, LNMSFX, DRB, GDR, STRES, meas):
self.measYr = measYr
self.MEMID = MEMID
self.MIDSK = MIDSK
self.fname = fname
self.lname = lname
self.LNMSFX = LNMSFX
self.DRB = DRB
self.GDR = GDR
self.STRES = STRES
self.meas = meas
def GTMESYR(self):
return self.measYr
def GTMEMSKY(self):
return self.MIDSK
def GTMEMID(self):
return self.MEMID
def GTFSNM(self):
return self.fname
def GTLSNM(self):
return self.lname
def GTLSTNMSF(self):
return self.LNMSFX
def GTDRB(self):
return self.DRB
def GTGDR(self):
return self.GDR
def getStateRes(self):
return self.STRES
def getMeas(self):
return self.meas
###############################################################################
# Function
def is_date(string, fuzzy=False):
try:
parse(string, fuzzy=fuzzy)
return True
except ValueError:
return False
def immunte(Fname, Lname, DRB, GDR, driver):
# work on search button
driver.find_element_by_xpath("//*[#id='edittesttest']").click()
# work on
lastname = driver.find_element_by_id("LM")
lastname.clear()
lastname.send_keys(Lname)
# work on
firstname = driver.find_element_by_id("FN")
firstname.clear()
firstname.send_keys(Fname)
# work on
birthdate = driver.find_element_by_id("DRB")
birthdate.clear()
birthdate.send_keys(DRB)
# work on advanced search button to input GDR
try:
driver.find_element_by_xpath(
"//*[#id='queryResultsForm']/table/tbody/tr/td[2]/table/tbody/tr[3]/td/table/tbody/tr[2]/td[5]/input").click()
# work on GDR selection button
obj = Select(driver.find_element_by_name("OSC"))
if GDR == 'W':
obj.select_by_index(2)
elif GDR == 'S':
obj.select_by_index(1)
else:
obj.select_by_index(3)
# work on search button
driver.find_element_by_name("cmdFindClient").click()
# two scenarios could emerge as a search result: 1, not found 2, the found
if "No were found for the requested search criteria" in driver.find_element_by_id("queryResultsForm").text:
al = []
elif "the found" in driver.find_element_by_id("queryResultsForm").text:
# work on button
driver.find_element_by_xpath(
"//*[#id='queryResultsForm']/table[2]/tbody/tr[2]/td[2]/span/label").click()
# work on pt button
driver.find_element_by_id("redirect1").click()
# work on getting rid of opt out - header
header = driver.find_elements_by_class_name("large")[1].text
if "Access Restricted" in header:
print(Fname+' '+Lname+' '+" Opt out")
al = []
elif "Information" in header:
# find the first line
first = driver.find_element_by_xpath(
"//*[#id='container']/table[3]/tbody/tr/td[2]/table[2]/tbody/tr/td/table/tbody/tr[1]/td/table/tbody/tr[5]/td[1]").text
if (first == None):
al = []
else:
even = driver.find_elements_by_class_name("evenRow")
odd = driver.find_elements_by_class_name("oddRow")
o = []
e = []
for value in odd:
o.append(value.text)
for value in even:
e.append(value.text)
length = len(o)
i = 0
al = []
# merge odd and even row together and remove the row marked with complete
while i < length:
al.append(e[i])
al.append(o[i])
i = i+1
# parse each row of information with a comma, add group name for row that are without one
for x in range(len(al)):
if is_date(al[x][1:10]):
al[x] = al[x].replace(' ', ',')
al[x] = al[x].replace(',of,', ' of ')
al[x] = group + ',' + al[x][2:]
else:
al[x] = al[x].replace(' ', ',')
al[x] = al[x].replace(',of,', ' of ')
g = al[x].split(',', 1)
group = g[0]
# work on returning to home page
driver.find_element_by_xpath(
"//*[#id='headerMenu']/table/tbody/tr/td[2]/div/a").click()
except NoSuchElementException:
al = []
except WebDriverException:
al = []
return al
def main():
# Welcome message and input info
print('\nThis is the test.')
print('You will be prompted to type .')
print('If you need to exit the script and stop its process press \'CTRL\' + \'C\'.')
file = input("\nEnter file name: ")
user = input("\nEnter username: ")
pw = input("\nEnter password: ")
date = str(datetime.date.today())
# output file
fileOutputName = 'FILELIST' + \
date.replace('-', '_') + '.csv'
fileOutputNameNotFound = 'NOTFOUNDFILELIST' + \
date.replace('-', '_') + '.csv'
fileOutput = open(fileOutputName, 'w')
fileOutputNotFound = open(fileOutputNameNotFound, 'w')
fileOutput.write('MEAS_YR,MEMLFIDSK,MEMLFID,MEMB_FRST_NM,MEMLSTNM,' +
'DRB,GNDR,RSDNC_STATE,IMUN_RGSTRY_STATE,VCCN_GRP,VCCN_ADMN_DT,DOSE_SERIES,' +
'BRND_NM,DOSE_SIZE,RCTN\n')
fileOutputNotFound.write('MEAS_YR,MEMLFIDSK,MEMLFID,MEMB_FRST_NM,MEMLSTNM,MEMB_SUFFIX,' +
'DRB,GNDR,RSDNC_STATE,IMUN_RGSTRY_STATE,VCCN_GRP,VCCN_ADMN_DT,DOSE_SERIES,' +
'BRND_NM,DOSE_SIZE,RCTN\n')
# If the file exists
try:
os.path.isfile(file)
except:
print('File Not Found\n')
df = pd.read_excel(file)
# create array of People objects and member ID
peopleArray = []
memberIdArray = []
df.dropna()
total = len(df)
not_found = 0
found = 0
# assign each record in the data frame into Person class
for i in range(total):
measYr = str(df.loc[i, "MEAS_YR"])
MEMID = str(df.loc[i, "MEMLFID"])
MIDSK = str(df.loc[i, "MEMLFIDSK"])
fname = str(df.loc[i, "MEMLFID"])
lname = str(df.loc[i, "MEMLSTNM"])
inputDate = str(df.loc[i, "DRB"])
# If date is null then assign an impossible date
if not inputDate:
DRB = '01/01/1900'
if '-' in inputDate:
DRB = datetime.datetime.strptime(
inputDate, "%Y-%m-%d %H:%M:%S").strftime('%m/%d/%Y')
else:
DRB = datetime.datetime.strptime(
str(df.loc[i, "DRB"]), '%m/%d/%Y').strftime('%m/%d/%Y')
GDR = str(df.loc[i, "GDR"])
STRES = str(df.loc[i, "STATE_RES"])
meas = str(df.loc[i, "MEAS"])
p = Person(measYr, MEMID, MIDSK, fname, lname,
LNMSFX, DRB, GDR, STRES, meas)
# append array
m = df.loc[i, "MEMLFID"]
if (m not in memberIdArray):
peopleArray.append(p)
memberIdArray.append(m)
# work on setting up driver for md immunet - mac forward slash/windows double backward slash
PATH = os.getcwd()+'\\'+'chromedriver'
s = Service(PATH)
driver = webdriver.Chrome(service = s)
driver.get("https://www.wow2.pe.org/prd-IR/portalmanager.do")
# work on login ID
username = driver.find_element_by_id("userField")
username.clear()
username.send_keys(user)
# work on password
password = driver.find_element_by_name("password")
password.clear()
password.send_keys(pw)
# work on getting to home page - where loop will start
driver.find_element_by_xpath(
"//*[#id='loginButtonForm']/div/div/table/tbody/tr[3]/td[1]/input").click()
for n in range(total):
p = peopleArray[n]
recordToWrite = ''
print('Looking up: ' + str(n)+' ' +
p.GTLSNM() + ', ' + p.GTFSNM())
MeasYr = p.GTMESYR()
MIDSK = p.GTMEMSKY()
MEMID = p.GTMEMID()
Fname = p.GTFSNM()
Lname = p.GTLSNM()
DRB = str(p.GTDRB())
GDR = p.GTGDR()
STRES = p.getStateRes()
children = immunte(Fname, Lname, DRB, GDR, driver)
if children == []:
not_found += 1
recordToWrite = MeasYr+','+MIDSK+','+MEMID+',' + Fname + \
','+Lname + ',' + ' ' + ','+DRB+','+GDR+','+STRES+','+'MD'
fileOutputNotFound.write(recordToWrite + '\n')
elif children != []:
found += 1
for x in range(len(children)):
data_element = children[x].split(",")
# if the admin date is not valid
if is_date(data_element[1]) and is_date(data_element[3]):
children[x] = ''
elif is_date(data_element[1]) and data_element[2] == 'NOT' and data_element[3] == 'VALID':
children[x] = ''
elif is_date(data_element[1]) and is_date(data_element[3]) == False:
if data_element[5] != 'No':
data_element[4] = data_element[5]
data_element[5] = ''
children[x] = ','.join(data_element[0:6])
else:
data_element[5] = ''
children[x] = ','.join(data_element[0:6])
else:
children[x] = ''
for x in range(len(children)):
if children[x] != '':
recordToWrite = MeasYr+','+MIDSK+','+MEMID+',' + \
Fname+','+Lname + ','+DRB+','+GDR+','+STRES+','+'MD'
recordToWrite = recordToWrite+','+children[x]
fileOutput.write(recordToWrite + '\n')
n = +1
fileOutput.close()
fileOutputNotFound.close()
print('\n--------------------------------OUTPUT--------------------------------')
print("Script completed.")
##############################################################################
main()
You can try this in the 'immunte' function:
def immunte(Fname, Lname, DRB, GDR, driver):
al = []
# work on search button
driver.find_element_by_xpath("//*[#id='edittesttest']").click()
# work on
lastname = driver.find_element_by_id("LM")
lastname.clear()
lastname.send_keys(Lname)
# work on
firstname = driver.find_element_by_id("FN")
firstname.clear()
firstname.send_keys(Fname)
# work on
birthdate = driver.find_element_by_id("DRB")
birthdate.clear()
birthdate.send_keys(DRB)
# work on advanced search button to input GDR
try:
driver.find_element_by_xpath(
"//*[#id='queryResultsForm']/table/tbody/tr/td[2]/table/tbody/tr[3]/td/table/tbody/tr[2]/td[5]/input").click()
# work on GDR selection button
obj = Select(driver.find_element_by_name("OSC"))
if GDR == 'W':
obj.select_by_index(2)
elif GDR == 'S':
obj.select_by_index(1)
else:
obj.select_by_index(3)
# work on search button
driver.find_element_by_name("cmdFindClient").click()
# two scenarios could emerge as a search result: 1, not found 2, the found
if "No were found for the requested search criteria" in driver.find_element_by_id("queryResultsForm").text:
return al
if "the found" in driver.find_element_by_id("queryResultsForm").text:
# work on button
driver.find_element_by_xpath(
"//*[#id='queryResultsForm']/table[2]/tbody/tr[2]/td[2]/span/label").click()
# work on pt button
driver.find_element_by_id("redirect1").click()
# work on getting rid of opt out - header
header = driver.find_elements_by_class_name("large")[1].text
if "Access Restricted" in header:
print(Fname+' '+Lname+' '+" Opt out")
return al
if "Information" in header:
# find the first line
first = driver.find_element_by_xpath(
"//*[#id='container']/table[3]/tbody/tr/td[2]/table[2]/tbody/tr/td/table/tbody/tr[1]/td/table/tbody/tr[5]/td[1]"
).text
if (first == None):
return al
even = driver.find_elements_by_class_name("evenRow")
odd = driver.find_elements_by_class_name("oddRow")
o = []
e = []
for value in odd:
o.append(value.text)
for value in even:
e.append(value.text)
length = len(o)
i = 0
al = []
# merge odd and even row together and remove the row marked with complete
while i < length:
al.append(e[i])
al.append(o[i])
i = i+1
# parse each row of information with a comma, add group name for row that are without one
for x in range(len(al)):
if is_date(al[x][1:10]):
al[x] = al[x].replace(' ', ',')
al[x] = al[x].replace(',of,', ' of ')
al[x] = group + ',' + al[x][2:]
else:
al[x] = al[x].replace(' ', ',')
al[x] = al[x].replace(',of,', ' of ')
g = al[x].split(',', 1)
group = g[0]
# work on returning to home page
driver.find_element_by_xpath(
"//*[#id='headerMenu']/table/tbody/tr/td[2]/div/a").click()
except NoSuchElementException:
pass
except WebDriverException:
pass
return al

Nested while loop to extract text data

I am trying to get data from a txt file and put it into a dataframe. Text file looks something like this:
******************************************************************************************************************************
DATE BUY:2018/05/26
****************************************************************************************************************************
STORE: DUBIDUBI SAILOR: 123456
***********************************************************************************************************************
< CLIENT >
NAME CLIENT MEMBER TYPE MEMBER NUMBER: 89101112
ANTONY STARK 1
<PRODUCTS>
NUM-PRODUCTS
6
< ADDRESS TO SEND>
186 FLEET STREET
-----------------------------------------------------------------------------------------------------------------------
< CLIENT >
NAME CLIENT MEMBER TYPE MEMBER NUMBER: 13141516
THOR 2
<PRODUCTS>
NUM-PRODUCTS
2
< ADDRESS TO SEND>
1800 PENNSYLVANIA STREET
<SERVICES>
NUM-SERVICE TYPE OF SERVICE
64 DEVOLUTION
*****************************************************************************************************************************
I want to get a dataframe containing a list of all the clients information that were assisted by same sailor in same store.
What does works: below code reads the text line by line and extract the information of each line.
data []
global STORE, DATE_BUY, SAILOR, CLIENT, MEMBER_NUM, NUM_PRODUCTS, ADDRESS, NUM_SERVICE, TYPE_MEMB, TYPE_SERV
STORE = ""
DATE_BUY = ""
SAILOR = ""
CLIENT = ""
MEMBER_NUM = ""
NUM_PRODUCTS = ""
ADDRESS = ""
NUM_SERVICE = ""
TYPE_MEMB = ""
TYPE_SERV = ""
with open ('myfile', 'r') as txt_file:
read_file = txt_file.readlines()
for i in range(0, len(read_file)):
line = read_file[i]
z = line[0:50]
a = line[0:9]
b = line[0:42]
c = line[112:132]
d = line[0:14]
e = line[0:14]
dif_client = line[0:58]
if a == " STORE":
STORE = line[10:28]
SAILOR = line[30:45]
elif c == " DATE BUY":
DATE_BUY = line[133:145]
elif b == " NAME CLIENT"
nextline = read_file[i + 1]
CLIENT = nextline[0:57]
MEMBER_NUM = nextline[96:126]
TYPE_MEMB = nextline[79:86]
elif d == " < ADDRESS":
nextline = read_file[i + 1]
ADDRESS = nextline[0:63]
elif e == " < PRODUCTS":
nextline = read_file[i + 1]
NUM_PRODUCTS = nextline[0:24]
elif f == " <SERVICES":
nextline = read_file[i + 1]
NUM_SERVICE = nextline[]
TYPE_SERV = nextline[]
data.append({'Store':STORE, 'Sailor':SAILOR, 'Date_Buy':DATE_BUY, 'Client':CLIENT, 'Member_Num':MEMBER_NUM,
'Type_Memb':TYPE_MEMB, 'Address':ADDRESS, 'Products':NUM_PRODUCTS,'Num_Serv':NUM_SERVICE, 'Type_Serv':TYPE_SERV})
df = pd.DataFrame(data)
What does NOT works: when using a nested while loop to extract information of each client assisted by a sailor my code simply does not end running. The code that does not work is:
data []
global STORE, DATE_BUY, SAILOR, CLIENT, MEMBER_NUM, NUM_PRODUCTS, ADDRESS, NUM_SERVICE, TYPE_MEMB, TYPE_SERV
STORE = ""
DATE_BUY = ""
SAILOR = ""
CLIENT = ""
MEMBER_NUM = ""
NUM_PRODUCTS = ""
ADDRESS = ""
NUM_SERVICE = ""
TYPE_MEMB = ""
TYPE_SERV = ""
with open ('myfile', 'r') as txt_file:
read_file = txt_file.readlines()
for i in range(0, len(read_file)):
line = read_file[i]
z = line[0:50]
a = line[0:9]
b = line[0:42]
c = line[112:132]
d = line[0:14]
e = line[0:14]
dif_client = line[0:58]
while dif_client != " < CLIENT >":
if a == " STORE":
STORE = line[10:28]
SAILOR = line[30:45]
elif c == " DATE BUY":
DATE_BUY = line[133:145]
elif b == " NAME CLIENT"
nextline = read_file[i + 1]
CLIENT = nextline[0:57]
MEMBER_NUM = nextline[96:126]
TYPE_MEMB = nextline[79:86]
elif d == " < ADDRESS":
nextline = read_file[i + 1]
ADDRESS = nextline[0:63]
elif e == " < PRODUCTS":
nextline = read_file[i + 1]
NUM_PRODUCTS = nextline[0:24]
elif f == " <SERVICES":
nextline = read_file[i + 1]
NUM_SERVICE = nextline[]
TYPE_SERV = nextline[]
data.append({'Store':STORE, 'Sailor':SAILOR, 'Date_Buy':DATE_BUY, 'Client':CLIENT, 'Member_Num':MEMBER_NUM,
'Type_Memb':TYPE_MEMB, 'Address':ADDRESS, 'Products':NUM_PRODUCTS,'Num_Serv':NUM_SERVICE, 'Type_Serv':TYPE_SERV})
df = pd.DataFrame(data)
The desired output should look something like this. I know that each client information comes when the word < CLIENT > appears in text.
Date_buy Store Sailor Client Member_Number Num_Products Address_to_send num_Service type_serv
2018/05/26 dubidubi 123456 ANTONY STARK 89101112 6 186 FLEET STREET
2018/05/26 dubidubi 123456 THOR 13141516 2 1800 PENNSYLVANIA STREET 64 DEVOLUTION
Thanks for the description. The problem is in the infinite loop you built:
dif_client = line[0:58]
while dif_client != " < CLIENT >":
if a == " STORE":
...
dif_client doesn't change within the loop. There is no break or other exit, only the while condition (which is good design). Therefore, once you get into the loop, you have no way to leave: dif_client is constant.
Your logic is incorrect: you have two loops that are trying to walk through the lines of the file:
for i in range(0, len(read_file)):
...
while dif_client != " < CLIENT >":
# Process one line
The body of the while is designed to process one line. When you're done with that, you need to go to the next iteration of the for to get the next line. Finding a CLIENT line is a if decision, not a loop.

Python 3.6 Lambda code error for os.environ variable

try :
sf = Salesforce(username = sfdc_username,
password = sfdc_password,
security_token = sfdc_security_token,
instance_url = sfdc_salesforce_instance_url,
domain = sfdc_sandbox)
print('salesforce login good')
except (SalesforceGeneralError,
SalesforceMoreThanOneRecord,
SalesforceMalformedRequest,
SalesforceExpiredSession,
SalesforceRefusedRequest,
SalesforceResourceNotFound) as e :
print(e.content[0]['message'])
sys.exit(1)
this portion of code on lambda is failing with the error:
a bytes-like object is required, not 'str': TypeError
Traceback (most recent call last):
File "/var/task/sfdc_etl/bin/sfdc_etl.py", line 80, in lambda_handler
domain = sfdc_sandbox)
File "/var/task/sfdc_etl/lib/python3.6/site-packages/simple_salesforce/api.py", line 146, in __init__
domain=self.domain)
File "/var/task/sfdc_etl/lib/python3.6/site-packages/simple_salesforce/login.py", line 80, in SalesforceLogin
username = escape(username)
File "/var/lang/lib/python3.6/html/__init__.py", line 19, in escape
s = s.replace("&", "&") # Must be done first!
TypeError: a bytes-like object is required, not 'str'
When I move this code to my test environment on an EC2 amazon linux and set the sfdc_sandox to 'test' in line, it works with no issues. I tried using os.environb["L_SFDC_SANDBOX"] and os.environ["L_SFDC_SANDBOX"].encode('utf8'), but that also did not help as it gave the same error. How do I fix the type error when I pull in this variable in Lambda?
Here is the entire script, maybe the error isn't because of that specific piece of code even though it seems like it is.
import os
import sys
# this adds the parent directory of bin so we can find the module
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
sys.path.append(parent_dir)
#This addes venv lib/python2.7/site-packages/ to the search path
mod_path = os.path.abspath(parent_dir+"/lib/python"+str(sys.version_info[0])+"."+str(sys.version_info[1])+"/site-packages/")
sys.path.append(mod_path)
from awsmgr.awsmgr import S3Helper
from base64 import b64decode
import boto3
from collections import OrderedDict
import datetime
from dateutil.parser import parse
import logging
import json
import math
import pandas as pd
from simple_salesforce import Salesforce, SalesforceLogin
from simple_salesforce.exceptions import SalesforceGeneralError, SalesforceMoreThanOneRecord, SalesforceMalformedRequest, SalesforceExpiredSession, SalesforceRefusedRequest, SalesforceResourceNotFound
from sqlalchemy import create_engine
from sqlalchemy import exc
current_path = os.path.dirname(os.path.realpath(__file__))
# Use this one for the parent directory
ENV_ROOT = os.path.abspath(os.path.join(current_path, os.path.pardir))
# Use this one for the current directory
#ENV_ROOT = os.path.abspath(os.path.join(current_path))
sys.path.append(ENV_ROOT)
def lambda_handler(event, context):
###############################
# Global Variable Definitions #
###############################
d_parse = parse
TMP_PATH = '/tmp'
igersUser = 'admin'
igersPwd = boto3.client('kms').decrypt(CiphertextBlob=b64decode(os.environ["RS_PASSWORD"]))['Plaintext']
igersHost = os.environ["RS_HOST"]
igers = create_engine('postgres://{}:{}#{}/ibdrs'.format(igersUser, igersPwd, igersHost), encoding="utf-8")
igersSchema = os.environ["RS_SCHEMA"]
s3 = S3Helper(debug=os.environ["DEBUG"])
nextObjFile = s3.get_s3_file('s3://test-sfdc-sds-team/sfdc-etl-jp-test/sfdc_etl/objects/next_object.txt',os.path.abspath(os.path.join(TMP_PATH,'next_object.txt')))
s3Destination = 's3://test-sfdc-sds-team/sfdc-etl-jp-test/sfdc_etl/json/'
s3Path = '{}_json'
s3NextObjDestination = 's3://test-sfdc-sds-team/sfdc-etl-jp-test/sfdc_etl/objects/{}'
fileCount = 1
sfdc_username = os.environ["L_USERNAME"].encode('utf8')
sfdc_salesforce_instance_url = os.environ["L_SALESFORCE_INSTANCE_URL"].encode('utf8')
sfdc_password = boto3.client('kms').decrypt(CiphertextBlob=b64decode(os.environ["L_PASSWORD"]))['Plaintext']
sfdc_security_token = boto3.client('kms').decrypt(CiphertextBlob=b64decode(os.environ["L_SECURITY_TOKEN"]))['Plaintext']
sfdc_sandbox = os.environ["L_SFDC_SANDBOX"].encode('utf8')
print(type(sfdc_username), type(sfdc_password), type(sfdc_security_token), type(sfdc_salesforce_instance_url), type(sfdc_sandbox))
try :
sf = Salesforce(username = sfdc_username,
password = sfdc_password,
security_token = sfdc_security_token,
instance_url = sfdc_salesforce_instance_url,
domain = sfdc_sandbox)
print('salesforce login good')
except (SalesforceGeneralError,
SalesforceMoreThanOneRecord,
SalesforceMalformedRequest,
SalesforceExpiredSession,
SalesforceRefusedRequest,
SalesforceResourceNotFound) as e :
print(e.content[0]['message'])
sys.exit(1)
# get nextobj from s3
with open(nextObjFile, 'r') as f :
nextObjItem = f.read().strip().lower()
nextObj = nextObjItem.lower()
print('Processing {}'.format(nextObj))
######################################################
# get rs table group permissions, store in dataframe #
######################################################
def rsGetGroupPerms(igers, nextObj) :
global groupPerms
groupPerms = {}
existingGroupPerms = '''
SELECT
namespace, item, type, groname
FROM
(
SELECT
use.usename AS subject,
nsp.nspname AS NAMESPACE,
cls.relname AS item,
cls.relkind AS TYPE,
use2.usename AS OWNER,
cls.relacl
FROM
pg_user use
CROSS JOIN pg_class cls
LEFT JOIN pg_namespace nsp ON cls.relnamespace = nsp.oid
LEFT JOIN pg_user use2 ON cls.relowner = use2.usesysid
WHERE
cls.relowner = use.usesysid
AND nsp.nspname NOT IN ( 'pg_catalog', 'pg_toast', 'information_schema' )
AND nsp.nspname IN ( 'salesforce' )
AND relacl IS NOT NULL
ORDER BY
subject,
NAMESPACE,
item
)
JOIN pg_group pu ON array_to_string( relacl, '|' ) LIKE'%%' || pu.groname || '%%'
WHERE item = '{}'
'''.format(nextObj)
groupPerms = pd.read_sql(existingGroupPerms, igers)
print('got the group permissions')
return groupPerms
#####################################################
# get rs table user permissions, store in dataframe #
# NOT CURRENTLY IN USE #
#####################################################
#def rsGetUseerPerms(igers, nextObj) :
# existingUserPerms = '''
# SELECT *
# FROM
# (
# SELECT
# schemaname
# ,objectname
# ,usename
# ,HAS_TABLE_PRIVILEGE(usrs.usename, fullobj, 'select') AND has_schema_privilege(usrs.usename, schemaname, 'usage') AS sel
# ,HAS_TABLE_PRIVILEGE(usrs.usename, fullobj, 'insert') AND has_schema_privilege(usrs.usename, schemaname, 'usage') AS ins
# ,HAS_TABLE_PRIVILEGE(usrs.usename, fullobj, 'update') AND has_schema_privilege(usrs.usename, schemaname, 'usage') AS upd
# ,HAS_TABLE_PRIVILEGE(usrs.usename, fullobj, 'delete') AND has_schema_privilege(usrs.usename, schemaname, 'usage') AS del
# ,HAS_TABLE_PRIVILEGE(usrs.usename, fullobj, 'references') AND has_schema_privilege(usrs.usename, schemaname, 'usage') AS ref
# FROM
# (
# SELECT schemaname, 't' AS obj_type, tablename AS objectname, schemaname + '.' + tablename AS fullobj FROM pg_tables
# UNION
# SELECT schemaname, 'v' AS obj_type, viewname AS objectname, schemaname + '.' + viewname AS fullobj FROM pg_views
# ) AS objs
# ,(SELECT * FROM pg_user) AS usrs
# ORDER BY fullobj
# )
# WHERE (sel = true or ins = true or upd = true or del = true or ref = true)
# and objectname = '{}'
# '''.format(nextObj)
#
# userPerms = pd.read_sql_query(existingUserPerms, igers)
# return userPerms
####################################################
# Connect to Salesforce, Query JSON, and Copy to S3#
####################################################
def sfToS3(fileCount, sf, nextObj) :
# Initiate list for returned data
pulls = []
# Pull initial Query
sfobject = sf.restful('sobjects/{}/describe/'.format(nextObj), params=None)
fields_list = [record['name'] for record in sfobject['fields']]
initialQuery = sf.query("SELECT {} FROM {}".format(','.join(fields_list),nextObj))
#Send a single file or the first file to S3
data = initialQuery['records']
try :
send_temp_jsonl_to_s3(data, nextObj, s3, s3Destination, fileCount, s3Path)
# Append initial query data to pulls
if 'nextRecordsUrl' in initialQuery :
pulls.append(initialQuery['nextRecordsUrl'])
nextChunk = initialQuery['nextRecordsUrl']
nextQuery = sf.query_more(nextChunk,True)
if 'nextRecordsUrl' in nextQuery :
pulls.append(nextQuery['nextRecordsUrl'])
x = True
fileCount = 2
while x == True:
try:
# set up while loop to re-query salesforce until returned
# query does not have a 'nextRecordsUrl' return value
# Query new 'nextRecordsUrl'
nextQuery = sf.query_more(nextQuery['nextRecordsUrl'],True)
# append new query to pulls
pulls.append(nextQuery['nextRecordsUrl'])
except: # This triggers when nextQuery['nextRecordsUrl'] does not exist
# set x to False to end loop
x = False
#if there was a follow on set of records, query it and add to S3
if len(pulls) >= 1 :
for i in range(len(pulls)) :
data = sf.query_more(str(pulls[i].split('/')[5]))['records']
send_temp_jsonl_to_s3(data, nextObj, s3, s3Destination, fileCount, s3Path)
fileCount += 1
print('completed sending JSON files to S3')
except :
print('Salesforce Object Empty, ending execution')
updateNextObj(nextObj, s3NextObjDestination)
sys.exit(1)
####################
# JSONL to S3 #
####################
def send_temp_jsonl_to_s3(data, nextObj, s3, s3Destination, fileCount, s3Path) :
fileName = '{}_file{}.json'
localFilePath ='/tmp/'
for element in data :
item = data.pop()
item.pop('attributes', None)
tempdict = OrderedDict({})
for k,v in item.items() :
if 'date' in k.lower() or 'stamp' in k.lower() :
if not v is None :
d = d_parse(v)
v = d.strftime('%Y-%m-%d %I:%M:%S')
tempdict[k.lower()] = v
else :
tempdict[k.lower()] = v
with open(localFilePath+fileName.format(nextObj,fileCount), 'a') as outfile :
outfile.write(json.dumps(tempdict))
outfile.write('\n')
s3.put_s3_file_datedpath(localFilePath+fileName.format(nextObj,fileCount),s3Destination+s3Path.format(nextObj))
os.remove(localFilePath+fileName.format(nextObj,fileCount))
#################################################
# maps SFDC type to SQL type - used for ddl #
#################################################
def map_data_type(sfdc_type, length):
"""
Definition to map Salesforce datatype to Redshift datatype.
"""
__MULTIPLIER = 1.3 # may not be Zero!
if length == 0:
length = 1092
if length == 4095:
length = 15000
if length > 65535:
length = 65534
if sfdc_type == u'boolean':
return u'varchar(5)'
elif sfdc_type == u'date':
return u'timestamp'
elif sfdc_type == u'datetime':
return u'timestamp'
elif sfdc_type == u'currency':
return u'decimal(38,6)'
elif sfdc_type == u'double':
return u'decimal(38,6)'
elif sfdc_type == u'int':
return u'numeric(10)'
elif sfdc_type == u'picklist':
return u'varchar({})'.format(length)
elif sfdc_type == u'id':
return u'varchar({})'.format(length)
elif sfdc_type == u'reference':
return u'varchar({})'.format(length)
elif sfdc_type == u'textarea':
if length >= (65535/length*__MULTIPLIER):
return u'varchar({})'.format(65534)
else:
return u'varchar({})'.format( math.ceil(length*__MULTIPLIER))
elif sfdc_type == u'email':
return u'varchar({})'.format(length)
elif sfdc_type == u'phone':
return u'varchar({})'.format(length)
elif sfdc_type == u'url':
return u'varchar({})'.format(length)
elif sfdc_type == u'multipicklist':
return u'varchar({})'.format(length)
elif sfdc_type == u'anyType':
if length >= 65535:
return u'varchar({})'.format(65534)
else:
return u'varchar({})'.format(math.ceil(length*__MULTIPLIER))
elif sfdc_type == u'percent':
return u'numeric(38,6)'
elif sfdc_type == u'combobox':
return u'varchar({})'.format(length)
elif sfdc_type == u'base64':
return u'varchar({})'.format(length)
elif sfdc_type == u'time':
return u'varchar(255)'
elif sfdc_type == u'string':
if length >= 65535:
return u'varchar({})'.format(65534)
else:
return u'varchar({})'.format(math.ceil(length*__MULTIPLIER))
else:
return u'varchar(65535)'
####################################
# Turn SFDC metadata into SQL #
####################################
def get_ddl(sf, nextObj, igersSchema, col_remove=None):
md = sf.restful("sobjects/{}/describe/".format(nextObj), params=None)
target_table=nextObj
total_field_count = 0
global ddl_str
ddl_str = ''
ddl_str += 'CREATE TABLE '+ igersSchema+"."+target_table +' ('
for x in md["fields"]:
#print x["name"]
if col_remove:
if x["name"].lower() in [element.lower() for element in col_remove]:
print("Skipping: {}".format(x["name"]))
continue
ddl_str += x["name"] + ' ' + map_data_type(x["type"],x["length"])
if x["name"] == 'Id':
ddl_str += ' NOT NULL DISTKEY'
ddl_str += ","
total_field_count = total_field_count + 1
ddl_str = ddl_str[:-1]
ddl_str += ')'
logging.info('DDL Successfully created...')
# print("Total Field Count: "+str(total_field_count))
return ddl_str
#########################
# Create Table from DDL, execute the copy query and update permissions #
#########################
def rs_operations(ddl_str, groupPerms, igersSchema, nextObj, s3Destination, s3Path, igers) :
today = datetime.date.today()
dated_path = today.strftime('%Y/%m/%d')
perms_statement = ''
drop_table = '''
DROP TABLE IF EXISTS {}.{} CASCADE
'''.format(igersSchema, nextObj)
loadQuery = '''
COPY {}.{}
FROM '{}{}/{}/'
iam_role 'arn:aws:iam::087024238921:role/LambdaFullAccessRole'
TRUNCATECOLUMNS
FORMAT AS JSON 'auto'
'''.format(igersSchema, nextObj, s3Destination, s3Path.format(nextObj), dated_path)
grantPerms = '''
GRANT SELECT ON {}.{} TO GROUP {}
'''
with igers.connect() as conn:
try :
conn.execute(drop_table)
print('completed drop table')
conn.execute(ddl_str)
print('completed create table')
conn.execute(loadQuery)
print('completed load query')
for row in range(len(groupPerms)) :
perms_statement = grantPerms.format(groupPerms['namespace'].iloc[row],groupPerms['item'].iloc[row],groupPerms['groname'].iloc[row])
conn.execute(perms_statement)
print('completed grant group permissions')
conn.close()
except exc.SQLAlchemyError as e :
print(e)
######################################
# Update Next Object and Write to S3 #
######################################
def updateNextObj(nextObj, s3NextObjDestination) :
objectsList = []
objectsFile = s3.get_s3_file('s3://test-sfdc-sds-team/sfdc-etl-jp-test/sfdc_etl/objects/sfdc_etl_objects.txt',os.path.abspath(os.path.join(TMP_PATH,'sfdc_etl_objects.txt')))
localNobjTempFile = os.path.abspath(os.path.join(TMP_PATH,'next_object.txt'))
nextObjText = ''
with open (objectsFile, 'r') as objs :
for line in objs :
objectsList.append(line.strip("\n"))
for i in range(len(objectsList)-1) :
if objectsList[i].lower() == nextObj :
nextObjText = objectsList[i+1]
print(nextObjText)
with open (localNobjTempFile, 'w') as f :
f.write(nextObjText)
s3.put_s3_file(localNobjTempFile,s3NextObjDestination.format('next_object.txt'))
print('completed Updating the next object')
################################################
# Test if the object exists and execute #
################################################
try :
getattr(sf,nextObj).describe()
except (SalesforceGeneralError,
SalesforceMoreThanOneRecord,
SalesforceMalformedRequest,
SalesforceExpiredSession,
SalesforceRefusedRequest,
SalesforceResourceNotFound) as e :
print(e.content[0]['message'] +', writing next object and ending')
updateNextObj(nextObj, s3NextObjDestination)
sys.exit(1)
rsGetGroupPerms(igers, nextObj)
sfToS3(fileCount, sf, nextObj)
get_ddl(sf, nextObj, igersSchema, col_remove=None)
rs_operations(ddl_str, groupPerms, igersSchema, nextObj, s3Destination, s3Path, igers)
updateNextObj(nextObj, s3NextObjDestination)
The problem is that the following line
sfdc_password = boto3.client('kms').decrypt(CiphertextBlob=b64decode(os.environ["L_PASSWORD"]))['Plaintext']
is returning a bytes object, and in the other hand, the Salesforce class expects a string.
Solution:
try :
sf = Salesforce(username=sfdc_username,
password=sfdc_password.decode("utf-8"),
security_token=sfdc_security_token,
instance_url=sfdc_salesforce_instance_url,
domain=sfdc_sandbox)
print('salesforce login good')
Note that the sfdc_password variable is being converted to string using the .decode bytes method

string.lower() in contents.lower() not working correctly

My function looks up WHOIS requests using sockets and proxies.
Sometimes there is an issue with a proxy and it returns no data so I check to see if the data contains the initial domain request and if it does, return the result.
However sometimes this returns TRUE even when there's nothing in the string data.
I have also tried to do len(data) > 25, etc but for some reason it can still return true.
if domain.lower() in data.lower():
obj = WhoisEntry(domain, data)
logger.debug('WHOIS success ' + domain + ': ' + data)
return {
'expiration_date': str(obj.expiration_date),
'status': str(obj.status),
'registrar': str(obj.registrar)
}
Full code
def whois_tcp(domain):
whois_servers = [ 'whois.verisign-grs.com', 'whois.internic.net', 'whois.crsnic.net' ]
attempts = 0
while attempts < 15:
attempts = attempts + 1
logger.debug('WHOIS attempt '+domain+': '+str(attempts))
whois_host = random.choice(whois_servers)
proxy = random.choice(proxies) # global variable from config.py
proxy = proxy.split(':')
try:
s = socks.socksocket()
s.setproxy(socks.PROXY_TYPE_SOCKS5, proxy[0], int(proxy[1]))
s.connect((whois_host, 43))
s.send(domain + '\n\r\n')
data = ''
buf = s.recv(1024)
while len(buf):
data += buf
buf = s.recv(1024)
s.close()
#if domain.lower() not in data.lower():
# raise Exception(domain, 'Domain not found in WHOIS: '+data)
# continue
if domain.lower() in data.lower():
obj = WhoisEntry(domain, data)
logger.debug('WHOIS success ' + domain + ': ' + data)
return {
'expiration_date': str(obj.expiration_date),
'status': str(obj.status),
'registrar': str(obj.registrar)
}
except Exception, e:
logger.error('WHOIS Lookup Failed: '+str(e))
return None
What am I doing wrong?
1.Try:
if str(domain).lower() in str(data).lower():
...
2.Check whether the value of domain variable is None or ''.
I added code to write each whois response to file and noticed that it was in fact including whois results
file = open('./whois_records/'+domain, 'w')
file.write(data)
file.close()
Not sure why this line of code wasn't outputting the data string in the log file:
logger.debug('WHOIS success ' + domain + ': ' + str(data))
I have found that different whois servers are giving me different results for lookups.
For example: bpi-group.com
1. whois.verisign-grs.com: **FREE**
2. whois.gandi.net: **TAKEN**
Additionally I need to use different whois server for different TLDs (com/net/org/info/biz)
Currently now using:
- tld.whois-servers.net
Code is still not perfect but is working better and original problem has been solved:
def whois_tcp(domain):
ext = tldextract.extract(domain)
if ext.suffix == 'org':
whois_servers = [ 'org.whois-servers.net' ]
elif ext.suffix == 'biz':
whois_servers = [ 'biz.whois-servers.net' ]
elif ext.suffix == 'info':
whois_servers = [ 'info.whois-servers.net' ]
elif ext.suffix == 'com':
whois_servers = [ 'com.whois-servers.net' ]
elif ext.suffix == 'net':
whois_servers = [ 'net.whois-servers.net' ]
else:
whois_servers = [ 'whois.verisign-grs.com', 'whois.internic.net', 'whois.crsnic.net' ]
attempts = 0
result = None
while attempts < 15:
attempts = attempts + 1
logger.debug('WHOIS attempt '+domain+': '+str(attempts))
whois_host = random.choice(whois_servers)
proxy = random.choice(proxies) # global variable from config.py
proxy = proxy.split(':')
try:
s = socks.socksocket()
s.setproxy(socks.PROXY_TYPE_SOCKS5, proxy[0], int(proxy[1]))
s.connect((whois_host, 43))
s.send(domain + '\n\r\n')
data = ''
buf = s.recv(1024)
while len(buf):
data += buf
buf = s.recv(1024)
s.close()
file = open('./whois_records/'+domain, 'w')
file.write(data)
file.close()
# if domain.lower() not in data.lower():
# raise Exception(domain, 'Domain not found in WHOIS: '+data)
# continue
if str(domain).lower() in str(data).lower():
obj = WhoisEntry.load(domain, data)
logger.debug('WHOIS success ' + domain + ': ' + str(data))
return {
'expiration_date': str(obj.expiration_date),
'status': str(obj.status),
'registrar': str(obj.registrar)
}
except Exception, e:
logger.error('WHOIS Lookup Failed: '+str(e))
return None

Trouble with recursive text splitting

trying to split text via text-defined boundary markers using recursion and create a list of lists and strings containing all of the organized parts of the original text file.
The split isn't happening.
Here is the short version: The real problem script:
def separate(text,boundary = None):
if boundary == None:
m = re.findall(r'(?<=boundary=).*',text)
i = 0
while i < len(m): #have all levels of Boundary/headers named
boundary = m[i]
textList = recursiveSplit(text,boundary)
i += 1
pdb.set_trace()
return textList
def recursiveSplit(chunk,boundary):
if type(chunk) is types.StringType:
ar = re.split(r'(?P<boundary>)(?!--)',chunk)
return ar
if type(chunk) is types.ListType:
i = 0
while i < len(chunk):
chunk[i] = recursiveSplit(chunk[i],boundary)
i += 1
return obj
I've posted this script before and people wanted me to post it in its entirety so I'll do that
#Textbasics email parser
#based on a "show original" file converted into text
from sys import argv
import re, os, pdb, types
script, filename = argv
text = open(filename).read()
type = "text only" #Set the default type of email
#cut the email up by sections
#--A section is defined as any time there are two line breaks in a row
textList = re.split(r"\n\n", text)
header = textList[0]
if re.search(r'MIME-Version',header):
type = "MIME"
# If mail has no attachments, parse as a text-only email
class Parser(object):
def __init__(self,textList):
a = 1
self.body = ""
self.textList = textList
self.header = textList[0]
while a < len(textList):
self.body = self.body + textList[a] + '\n\n'
a += 1
m = re.search(r'(?<=Subject: ).*', self.header)
self.subject = m.group(0)
m = re.search(r'(?<=From: ).*', self.header)
self.fromVar = m.group(0)
m = re.search(r'(?<=To: ).*', self.header)
self.toVar = m.group(0)
m = re.search(r'(?<=Date: )\w+\s\w+\s\w+', self.header)
self.date = m.group(0)
def returnParsed(self,descriptor = "all"):
if descriptor == "all":
retv = "Subject: " + self.subject + "\n" + "From: " + self.fromVar + "\n" + "To: " + self.toVar + "\n" + "Date: " + self.date + "\n" + "\n" + self.body
return retv
if descriptor == "subject":
return self.subject
if descriptor == "fromVar":
return self.fromVar
if descriptor == "toVar":
return self.toVar
if descriptor == "date":
return self.date
if descriptor == "body":
return self.body
class MIMEParser(Parser):
class MIMEDataDecoder(object):
def __init__(self,decodeString,type):
pass
def __init__(self,textList):
self.textList = textList
self.nestedItems = []
newItem = NestedItem(self)
newItem.setContentType("Header")
newItem.setValue(self.textList[0])
self.nestedItems.append(newItem)
if re.search(r'(boundary=)',newItem.value):
helperItem = NestedItem(self)
helperItem.value = (self.textList[0])
m = re.search(r'(?<=Content-Type: ).+(?=;)',newItem.value)
helperItem.setContentType(m.group(0))
self.nestedItems.append(helperItem)
self.organizeData()
"""i = 0
while i < len(self.textList):
newItem = NestedItem(self)
ct = self.nextContentType
newItem.setContentType(ct)
newItem.setValue(self.textList[i])
self.nestedItems.append(newItem)
m = re.search(r'(?<=Content-Type: ).+(?=;)',self.textList[i])
if m:
self.nextContentType = m.group(0)
i += 1
"""
def nestItem (self,item):
self.nestedItems.append(item)
def organizeData(self):
self.nestLevel = 1
self.currentSuper = self
m = re.search(r'(?<=boundary=).*',self.textList[0])
self.currentBoundary = m.group(0)
self.currentList = self.textList
self.currentList.remove(self.textList[0])
self.formerObjectDatabase = {}
pdb.set_trace()
while self.nestLevel > 0:
i = 0
while i < len(self.currentList):
boundary = self.currentBoundary
#If block is a "normal block", containing a current boundary identifier
p = re.search(r'--(?P<boundary>)(?!--)', text)
if p:
newItem = NestedItem(self.currentSuper)
newItem.setValue(self.currentList[i])
r = re.search(r'(?<=Content-Type: ).+(?=;)',newItem.value)
if r:
newItem.setContentType(r.group(0))
self.currentObject = newItem
self.currentSuper.nestItem(self.currentObject)
#If the block contains a new block boundary
m = re.search(r'(?<=boundary=).*',self.currentList[i])
if m:
#begin new layer of recursive commands
newFormerObject = self.FormerCurrentObject(self.currentList,self.currentSuper,self.currentBoundary)
self.formerObjectDatabase[self.nestLevel] = newFormerObject
self.currentSuper = self.currentObject
self.nestLevel += 1
self.currentBoundary = m.group(0)
boundary = self.currentBoundary
#self.currentList = re.split(r'--(?P<boundary>)(?!--)', self.currentList[i])
boundary = self.currentBoundary
#If block contains an "end of boundary" marker
q = re.search(r'(?P<boundary>)--', text)
if q:
self.nestLevel -= 1
currentObject = self.formerObjectDatabase[self.nestLevel]
self.currentList = currentObject.formerList
self.currentSuper = currentObject.formerSuper
self.currentBoundary = currentObject.formerBoundary
i += 1
class FormerCurrentObject:
def __init__(self,formerList,formerSuper,formerBoundary):
self.formerList = formerList
self.formerSuper = formerSuper
self.formerBoundary = formerBoundary
def printAll(self):
print "printing all: %d" % len(self.nestedItems)
i = 0
while i < len(self.nestedItems):
print "printing out item %d" % i
self.nestedItems[i].printOut()
i += 1
class NestedItem(object):
def __init__(self,superObject,contentType=" ",value = " "):
self.superObject = superObject
self.contentType = contentType
self.value = value
self.nestedItems = []
def nestItem(self,item):
self.nestedItems.append(item)
def printOut(self,printBuffer = ""):
print printBuffer + '++%s' % self.contentType
print printBuffer + self.value
a = 0
printBuffer = printBuffer + " "
while a < len(self.nestedItems):
self.nestedItems[a].printOut(printBuffer)
def setContentType(self,contentType):
self.contentType = contentType
def setValue(self,value):
self.value = value
if type == "text only":
p = Parser(textList)
print p.returnParsed()
# ---PROBLEM CODE STARTS HERE---
def separate(text,boundary = None):
pdb.set_trace()
if boundary == None:
m = re.findall(r'(?<=boundary=).*',text)
i = 0
textList = [text]
while i < len(m): #have all levels of Boundary/headers named
boundary = m[i]
textList = recursiveSplit(textList,boundary)
i += 1
return textList
def recursiveSplit(chunk,boundary):
if type(chunk) is types.ListType: #<<--error occurs here
for obj in chunk:
recursiveSplit(obj,boundary)
if type(chunk) is types.StringType:
list = re.split(r'(?P<boundary>)(?!--)',chunk)
return list
return None
#---PROBLEM CODE ENDS(?) HERE---
if type == "MIME":
#separate the text file instead by its boundary identifier
p = MIMEParser(separate(text))
p.printAll()
You can use any MIME type email for this to run. Here's the one I've been using for convenience
MIME-Version: 1.0
Received: by 10.112.170.40 with HTTP; Fri, 3 May 2013 05:08:21 -0700 (PDT)
Date: Fri, 3 May 2013 08:08:21 -0400
Delivered-To: MYEMAIL#gmail.com
Message-ID: <#mail.gmail.com>
Subject: MiB 5/3/13 7:43AM (EST)
From: ME<MYEMAIL#gmail.com>
To: SOMEONE <SOMEONE#aol.com>
Content-Type: multipart/mixed; boundary=BNDRY1
--BNDRY1
Content-Type: multipart/alternative; boundary=BNDRY2
--BNDRY2
Content-Type: text/plain; charset=ISO-8859-1
-changed signature methods to conform more to working clinic header
methods(please test/not testable in simulator)
-confirmed that signature image is showing up in simulator. Awaiting
further tests
-Modified findings spacing/buffer. See if you like it
--BNDRY2
Content-Type: text/html; charset=ISO-8859-1
<div dir="ltr">-changed signature methods to conform more to working clinic header methods(please test/not testable in simulator)<div style>-confirmed that signature image is showing up in simulator. Awaiting further tests</div>
<div style>-Modified findings spacing/buffer. See if you like it</div></div>
--BNDRY2--
--BNDRY1
Content-Type: application/zip; name="Make it Brief.ipa.zip"
Content-Disposition: attachment; filename="Make it Brief.ipa.zip"
Content-Transfer-Encoding: base64
X-Attachment-Id: f_hg9biuno0
<<FILE DATA>>
--BNDRY1--
The issue was in the regex. There may be a cooler way to do it, but I just created a search string literal based off of the variables.
def recursiveSplit(chunk,boundary):
if type(chunk) is types.StringType:
#ar = re.split(r'(?P<boundary>)(?!--)',chunk)
searchString = "--%s" % boundary
print searchString
ar = re.split(searchString,chunk)
return ar
if type(chunk) is types.ListType:
i = 0
while i < len(chunk):
chunk[i] = recursiveSplit(chunk[i],boundary)
i += 1
return obj

Categories

Resources