Hello, I have a network in a particular format, namely .gdf. It is a text file in the following format:
network:
nodedef>name VARCHAR,label VARCHAR
0,' 0 '
1,' 1 '
2,' 2 '
edgedef>node1 VARCHAR,node2 VARCHAR,weight DOUBLE
0,1,0.2
0,2,0.2
0,3,0.2
0,4,0.333333
where the first part refers to nodes and the second part to edges.
I want to read the file, add an attribute to the nodes, and return the following:
network:
nodedef>name VARCHAR,label VARCHAR, att1 VARCHAR
0,' 0 ', 'Paul'
1,' 1 ', 'Jack'
2,' 2 ', 'John'
edgedef>node1 VARCHAR,node2 VARCHAR,weight DOUBLE
0,1,0.2
0,2,0.2
0,3,0.2
0,4,0.333333
Here is some code that does the first half of what you asked for. It will parse the .gdf file and make the information available to you. Adding attributes and writing them out is mostly left as an exercise for the reader, though a rough sketch of that half follows the parser.
import ast
import collections
import re


def main():
    parser = GDFParser()
    with open('network.gdf') as file:
        parser.read(file)
    print(*parser.data, sep='\n')


def pivot(iterable):
    # Transpose rows into columns, growing the column list as needed.
    columns = []
    for row in iterable:
        columns.extend([] for _ in range(len(row) - len(columns)))
        for column, cell in zip(columns, row):
            column.append(cell)
    return columns


class GDFParser:
    HEADER = re.compile(r'\w+:')
    DEF = re.compile(r'\w+>\w+ (?:DOUBLE|VARCHAR)(?:,\w+ (?:DOUBLE|VARCHAR))*')
    CAST = dict(DOUBLE=float, VARCHAR=str)

    def __init__(self):
        self.__header = None
        self.__type = []
        self.__data = []

    @property
    def header(self):
        return self.__header

    @property
    def data(self):
        return tuple(self.__data)

    def read(self, file):
        for line in file:
            self.__read_line(line.strip())

    def __read_line(self, line):
        if self.HEADER.fullmatch(line):
            self.__process_header(line)
        elif self.DEF.fullmatch(line):
            self.__process_def(line)
        else:
            self.__process_data(line)

    def __process_header(self, line):
        if self.header:
            raise ValueError('header was previously set')
        self.__header = line[:-1]

    def __process_def(self, line):
        # Build a namedtuple class and a tuple of cast functions per definition.
        name, fields = line.split('>')
        columns, casts = pivot(field.split() for field in fields.split(','))
        self.__type.append((collections.namedtuple(name, columns),
                            tuple(map(self.CAST.__getitem__, casts))))

    def __process_data(self, line):
        if not self.__type:
            raise ValueError('a definition must come before its data')
        kind, casts = self.__type[-1]
        self.__data.append(kind(*(cast(item) for cast, item in
                                  zip(casts, ast.literal_eval(line)))))


if __name__ == '__main__':
    main()
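For the second half, here is a minimal, untested sketch of how one might attach the attribute and write the .gdf back out. It assumes the parser above, hard-codes the headers from the question, and takes a hypothetical names dict (e.g. {'0': 'Paul', '1': 'Jack', '2': 'John'}) mapping node names to att1 values:

def write_gdf(parser, names, out_path='network_out.gdf'):
    # Sketch only: `names` and `out_path` are hypothetical.
    with open(out_path, 'w') as out:
        out.write('network:\n')
        out.write('nodedef>name VARCHAR,label VARCHAR,att1 VARCHAR\n')
        for row in parser.data:
            if type(row).__name__ == 'nodedef':
                out.write("{},'{}','{}'\n".format(
                    row.name, row.label, names.get(row.name, '')))
        out.write('edgedef>node1 VARCHAR,node2 VARCHAR,weight DOUBLE\n')
        for row in parser.data:
            if type(row).__name__ == 'edgedef':
                out.write('{},{},{}\n'.format(row.node1, row.node2, row.weight))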
I want to create multiple line items for the agreement number. As you can see below, I have used only Agreement_Number[0] to print the first value from dental_df['Agreement_Number'], but I want all the items in the agreement number column to be printed.
If I pass the whole column, __str__ gives me an error:
AttributeError: 'Series' object has no attribute 'ljust'
Can anybody help me? I am missing something.
class Reporting_Information:
    """ ES0388F4 UCCI 130 Record (Reporting Information) """
    fields = (
        ('Agreement_Number', 13),
        ('Contract_ID', 27),
        ('Record_Identifier', 3),
        ('Sequence_Number', 7),
        ('Report_Count', 2),
        ('Report_Qualifier_Code', 2),
        ('Report_Qualifier', 0),
        ('Effective_Date', 10),
        ('Report_ID', 30),
        ('Report_Qualifier2', 10),
        ('Cancel_Date', 0),
        ('Filler', 396)
    )

    def __init__(self):
        self.Agreement_Number = ''
        self.Contract_ID = ''
        self.Record_Identifier = '130'
        self.Sequence_Number = '0'
        self.Report_Count = '01'
        self.Report_Qualifier_Code = 'LU'
        self.Report_Qualifier = ''
        self.Effective_Date = ''
        self.Report_ID = ''
        self.Report_Qualifier2 = ''
        self.Cancel_Date = ''
        self.Filler = ''

    def __str__(self):
        for field_name, width in self.fields:
            return ''.join([getattr(self, field_name).ljust(width)])
r = Reporting_Information()
r.Agreement_Number = dental_df['Agreement_Number'][0]
r1 = str(r)
print(r1)
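For what it's worth, a minimal sketch of one way to get a line per agreement number, assuming dental_df is a pandas DataFrame (note that __str__ as written also returns after the first field; joining all fields is probably what was intended):

for agreement_number in dental_df['Agreement_Number']:
    r = Reporting_Information()
    r.Agreement_Number = str(agreement_number)  # .ljust needs a str, not a Series
    print(str(r))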
How can I provide the with_entities option with dynamic input?
At the moment I have to do this:
columns = [DataModel.col1, DataModel.col2, ...]
data = DataModel.query.order_by(DataModel.id.desc()).with_entities(*columns).first()
But what should I do if I get the column names as query-string parameters and have to define them dynamically?
EDIT: I solved it this way:
In my model I define the classmethod:
@classmethod
def find_by_filter(cls, attr):
    search_columns = [getattr(cls, i) for i in attr]
    return cls.query.order_by(cls.id.desc()).with_entities(*search_columns).first()
and then I can call it from my Rest API this way:
liste = ["column1", "column2", "column3", "column4"]
data = DataModel.find_by_filter(liste)
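Since the names come straight from the query string, it may be worth guarding against unknown columns before the getattr; a hedged variation on the classmethod above:

@classmethod
def find_by_filter(cls, attr):
    # Skip names that are not attributes of the model instead of letting
    # getattr raise AttributeError on bad query-string input.
    search_columns = [getattr(cls, i) for i in attr if hasattr(cls, i)]
    return cls.query.order_by(cls.id.desc()).with_entities(*search_columns).first()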
I had this same challenge and this is how I solved it.
columns is just a CSV string, e.g. "field1,field2,field3".
Could be further condensed, but this is enough for readability.
def pick_Columns(modelClass, columns):
    if len(columns) == 0:
        return None
    colArray = columns.split(',')
    # getattr avoids eval on user-supplied column names
    modelObjects = [getattr(modelClass, col) for col in colArray]
    return modelObjects


def Get_Data(**kwargs):
    retVal = kwargs['db'].query(m_DataModel)
    if kwargs.get('columns'):  # truthy check also skips an empty string
        colObj = pick_Columns(m_DataModel, kwargs['columns'])
        retVal = retVal.with_entities(*colObj)
        retVal = [row._asdict() for row in retVal.all()]  # with_entities returns a tuple instead of a dict. Remove this line if that's what you want
    else:
        retVal = retVal.all()
    return retVal
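A hypothetical call, assuming session is an existing SQLAlchemy session and m_DataModel is the mapped class from the snippet:

rows = Get_Data(db=session, columns="field1,field2")  # list of row dicts
all_rows = Get_Data(db=session)                       # full model objects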
I have a bunch of (modified) RIS files. The toy example looks like the following:
Record #1 of 2
ID: CN-01160769
AU: Uedo N
AU: Kasiser R
TI: Development of an E-learning system
SO: United European Gastroenterology Journal
YR: 2015
Record #2 of 2
ID: CN-01070265
AU: Krogh LQ
TI: E-learning in pediatric basic life support
SO: Resuscitation
YR: 2015
In brief, each record starts with a Record # line and ends with two blank lines. The task is to parse the file and extract tags and fields.
Pasted below is my current code (adapted from here):
import re


class RIS:
    """ RIS file structure """

    def __init__(self, in_file=None):
        """ Initialize and parse input """
        self.records = []
        if in_file:
            self.parse(in_file)

    def parse(self, in_file):
        """ Parse input file """
        self.current_tag = None
        self.current_record = None
        prog = re.compile("^([A-Z][A-Z0-9]): (.*)")
        lines = []
        # Eliminate blank lines
        for line in in_file:
            line = line.strip()
            if len(line) > 0:
                lines.append(line)
        for line in lines:
            match = prog.match(line)
            if match:
                tag = match.groups()[0]
                field = match.groups()[1]
                self.process_field(tag, field)
            else:
                raise ValueError(line)

    def process_field(self, tag, field):
        """ Process RIS file field """
        if tag == "ID":
            self.current_record = {tag: field}
        elif tag == "YR":
            self.records.append(self.current_record)
            self.current_record = None
        elif tag in ["AU", "AD"]:
            if tag in self.current_record:
                self.current_record[tag].append(field)
            else:
                self.current_record[tag] = [field]
        else:
            if tag not in self.current_record:
                self.current_record[tag] = field
            else:
                error_str = "Duplicate tag: %s" % tag
                raise ValueError(error_str)


def main():
    """ Test the code """
    import pprint
    with open("test.ris", "rt") as ris_file:
        ris = RIS(ris_file)
    pp = pprint.PrettyPrinter()
    pp.pprint(ris.records)


if __name__ == "__main__":
    main()
The current code doesn't work because it doesn't recognize the start line (e.g., Record #1 of 2), and in addition it doesn't know where a record stops. In the current version I use ID as a start tag and YR as a stop tag. However, the code exits with the error:
ValueError: Record #1 of 2
Any suggestions on how to properly adapt the code are greatly welcome.
You just need to add a check that skips the Record #x of 2 lines.
import re


class RIS:
    """ RIS file structure """

    def __init__(self, in_file=None):
        """ Initialize and parse input """
        self.records = []
        if in_file:
            self.parse(in_file)

    def parse(self, in_file):
        """ Parse input file """
        self.current_tag = None
        self.current_record = None
        prog = re.compile("^([A-Z][A-Z0-9]): (.*)")
        lines = []
        # Eliminate blank lines
        for line in in_file:
            line = line.strip()
            if len(line) > 0:
                lines.append(line)
        for line in lines:
            if "#" in line:
                continue
            match = prog.match(line)
            if match:
                tag = match.groups()[0]
                field = match.groups()[1]
                self.process_field(tag, field)
            else:
                raise ValueError(line)

    def process_field(self, tag, field):
        """ Process RIS file field """
        if tag == "ID":
            self.current_record = {tag: field}
        elif tag == "YR":
            self.records.append(self.current_record)
            self.current_record = None
        elif tag in ["AU", "AD"]:
            if tag in self.current_record:
                self.current_record[tag].append(field)
            else:
                self.current_record[tag] = [field]
        else:
            if tag not in self.current_record:
                self.current_record[tag] = field
            else:
                error_str = "Duplicate tag: %s" % tag
                raise ValueError(error_str)


def main():
    """ Test the code """
    import pprint
    with open("test.ris", "rt") as ris_file:
        ris = RIS(ris_file)
    pp = pprint.PrettyPrinter()
    pp.pprint(ris.records)


if __name__ == "__main__":
    main()
The added code:
    if "#" in line:
        continue
The output is:
[{'AU': ['Uedo N', 'Kasiser R'],
'ID': 'CN-01160769',
'SO': 'United European Gastroenterology Journal',
'TI': 'Development of an E-learning system'},
{'AU': ['Krogh LQ'],
'ID': 'CN-01070265',
'SO': 'Resuscitation',
'TI': 'E-learning in pediatric basic life support'}]
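As a side note, a hedged alternative (not what the answer above does) is to treat the Record # line itself as the record delimiter, so parsing does not depend on YR being the last tag of each record; a minimal sketch:

import re

def parse_ris(in_file):
    # Sketch: split records on "Record #" header lines instead of using
    # ID/YR as start/stop tags; unlike the code above, YR is kept here.
    prog = re.compile(r"^([A-Z][A-Z0-9]): (.*)")
    records, current = [], None
    for line in in_file:
        line = line.strip()
        if line.startswith("Record #"):   # header line starts a new record
            current = {}
            records.append(current)
            continue
        match = prog.match(line)
        if match and current is not None:
            tag, field = match.groups()
            if tag in ("AU", "AD"):       # repeatable tags collect into lists
                current.setdefault(tag, []).append(field)
            else:
                current[tag] = field
    return records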
I'm trying to find the most efficient way to create differently named functions (myfunction_a, myfunction_b, myfunction_c, ...) with slightly different code (the input file name, e.g. 'app/data/mydata_a.csv'). Here below is the function I've got:
import csv
import json
import os

from django.http import HttpResponse


def myfunction_a(request):
    os.getcwd()  # Should get this Django project root (where manage.py is)
    fn = os.path.abspath(os.path.join(os.getcwd(), 'app/data/mydata_a.csv'))
    # TODO: Move to helper module
    response_data = {}
    data_format = 'tsv'
    if data_format == 'json':
        with open(fn, 'rb') as tsvin:
            tsvin = csv.reader(tsvin, delimiter='\t')
            for row in tsvin:
                print 'col1 = %s col2 = %s' % (row[0], row[1])
                response_data[row[0]] = row[1]
        result = HttpResponse(json.dumps(response_data), content_type='application/json')
    else:
        with open(fn, 'rb') as tsvin:
            buff = tsvin.read()
        result = HttpResponse(buff, content_type='text/tsv')
    return result
I want to be able to loop through my list and create multiple function names:
mylist = ['a','b','c' ... 'z' ]
def myfunction_a(request):
... ( 'app/data/mydata_a.csv' )
return request
to get final result of :
def myfunction_a => taking 'app/data/mydata_a.csv'
def myfunction_b => taking 'app/data/mydata_b.csv'
def myfunction_c => taking 'app/data/mydata_c.csv'
Right now I just copy, paste, and change it. Is there a better way to do this? Any recommendation would be appreciated. Thanks.
You can put a variable into a string with
"app/data/mydata_%s.csv" % (character)
so
for character in mylist:
    print "app/data/mydata_%s.csv" % (character)
prints the path once per character, substituting it at the %s.
Since you want every function to use a different string to open a different file, you can do something like this:
def myfunction(label, request):
    return "app/data/mydata_%s.csv" % (label)
This way the label ends up in the document path. Since you only want the file name to vary with the label, you need another parameter, not a new function name.
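In other words, one function covers all the files (hypothetical call; request is whatever your view received):

path = myfunction('a', request)  # "app/data/mydata_a.csv"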
If you must have a special function name, you could do this. Though why you'd need to I'm not sure.
import functools, sys

namespace = sys._getframe(0).f_globals

def myfunction(label, request):
    print request
    return "app/data/mydata_%s.csv" % (label)

my_labels = ['a', 'b', 'c']
for label in my_labels:
    namespace['myfunction_%s' % label] = functools.partial(myfunction, label)

print myfunction_a('request1')
print myfunction_b('request2')
Output is this:
request1
app/data/mydata_a.csv
request2
app/data/mydata_b.csv
Or possibly a better implementation would be:
class MyClass(object):
    def __init__(self, labels):
        for label in labels:
            setattr(self, label, functools.partial(self._myfunction, label))

    def _myfunction(self, label, request):
        print request
        return "app/data/mydata_%s.csv" % (label)

myfunction = MyClass(['a', 'b', 'c'])
print myfunction.c('request3')
Output is this:
request3
app/data/mydata_c.csv
Okay so I have a transaction file:
IN CU
Customer_ID=
Last_Name=Johnston
First_Name=Karen
Street_Address=291 Stone Cr
City=Toronto
//
IN VE
License_Plate#=LSR976
Make=Cadillac
Model=Seville
Year=1996
Owner_ID=779
//
IN SE
Vehicle_ID=LSR976
Service_Code=461
Date_Scheduled=00/12/19
IN means insert, and CU (meaning customer) refers to which file we are writing to; in this case it's customer.diff. The problem I'm having is that I need to go through each line and check the value of each field (Customer_ID, for example). You see how Customer_ID is left blank? I need to replace any blank numeric fields with a value of 0, so in this case Customer_ID=0. Here's what I have so far, but nothing is changing:
def insertion():
    field_names = {'Customer_ID=': 'Customer_ID=0',
                   'Home_Phone=': 'Home_Phone=0',
                   'Business_Phone=': 'Business_Phone=0'}
    with open('xactions.two.txt', 'r') as from_file:
        search_lines = from_file.readlines()
        if search_lines[3:5] == 'CU':
            for i in search_lines:
                if field_names[i] == True:
                    with open('customer.diff', 'w') as to_file:
                        to_file.write(field_names[i])
Thanks
Why not try something a little simpler? I haven't tested this code.
def insertion():
    field_names = {'Customer_ID=': 'Customer_ID=0',
                   'Home_Phone=': 'Home_Phone=0',
                   'Business_Phone=': 'Business_Phone=0'}
    with open('xactions.two.txt', 'r') as from_file:
        with open('customer.diff', 'w') as to_file:
            for line in from_file:
                line = line.rstrip("\n")
                found = False
                for field in field_names.keys():
                    if line == field:  # field present with a blank value
                        to_file.write(line + "0")
                        found = True
                if not found:
                    to_file.write(line)
                to_file.write("\n")
Here's a fairly comprehensive approach; it's a bit long, but not as complicated as it looks!
I assume Python 3.x, although it should work in Python 2.x with few changes. I make extensive use of generators to stream data through rather than holding it in memory.
To start with: we are going to define the expected data-type for each field. Some fields do not correspond to built-in Python data types, so I start by defining some custom data types for those fields:
import time


class Date:
    def __init__(self, s):
        """
        Parse a date provided as "yy/mm/dd"
        """
        if s.strip():
            self.date = time.strptime(s, "%y/%m/%d")
        else:
            self.date = time.gmtime(0.)

    def __str__(self):
        """
        Return a date as "yy/mm/dd"
        """
        return time.strftime("%y/%m/%d", self.date)


def Int(s):
    """
    Parse a string to integer ("" => 0)
    """
    if s.strip():
        return int(s)
    else:
        return 0


class Year:
    def __init__(self, s):
        """
        Parse a year provided as "yyyy"
        """
        if s.strip():
            self.date = time.strptime(s, "%Y")
        else:
            self.date = time.gmtime(0.)

    def __str__(self):
        """
        Return a year as "yyyy"
        """
        return time.strftime("%Y", self.date)
Now we set up a table, defining what type each field should be:
# Expected data-type of each field:
#   data_types[section][field] = type
data_types = {
    "CU": {
        "Customer_ID": Int,
        "Last_Name": str,
        "First_Name": str,
        "Street_Address": str,
        "City": str
    },
    "VE": {
        "License_Plate#": str,
        "Make": str,
        "Model": str,
        "Year": Year,
        "Owner_ID": Int
    },
    "SE": {
        "Vehicle_ID": str,
        "Service_Code": Int,
        "Date_Scheduled": Date
    }
}
We parse the input file; this is by far the most complicated bit! It's a finite state machine implemented as a generator function, yielding a section at a time:
# Customized error-handling
class TransactionError         (BaseException): pass
class EntryNotInSectionError   (TransactionError): pass
class MalformedLineError       (TransactionError): pass
class SectionNotTerminatedError(TransactionError): pass
class UnknownFieldError        (TransactionError): pass
class UnknownSectionError      (TransactionError): pass


def read_transactions(fname):
    """
    Read a transaction file

    Return a series of ("section", {"key": "value"})
    """
    section, accum = None, {}
    with open(fname) as inf:
        for line_no, line in enumerate(inf, 1):
            line = line.strip()
            if not line:
                # blank line - skip it
                pass
            elif line == "//":
                # end of section - return any accumulated data
                if accum:
                    yield (section, accum)
                section, accum = None, {}
            elif line[:3] == "IN ":
                # start of section
                if accum:
                    raise SectionNotTerminatedError(
                        "Line {}: Preceding {} section was not terminated"
                        .format(line_no, section)
                    )
                else:
                    section = line[3:].strip()
                    if section not in data_types:
                        raise UnknownSectionError(
                            "Line {}: Unknown section type {}"
                            .format(line_no, section)
                        )
            else:
                # data entry: "key=value"
                if section is None:
                    raise EntryNotInSectionError(
                        "Line {}: '{}' should be in a section"
                        .format(line_no, line)
                    )
                pair = line.split("=")
                if len(pair) != 2:
                    raise MalformedLineError(
                        "Line {}: '{}' could not be parsed as a key/value pair"
                        .format(line_no, line)
                    )
                key, val = pair
                if key not in data_types[section]:
                    raise UnknownFieldError(
                        "Line {}: unrecognized field name {} in section {}"
                        .format(line_no, key, section)
                    )
                accum[key] = val.strip()
    # end of file - nothing should be left over
    if accum:
        raise SectionNotTerminatedError(
            "End of file: Preceding {} section was not terminated"
            .format(line_no, section)
        )
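For instance, a hedged usage sketch (assuming the sample input is saved as transaction.txt and that every section, including the last, is terminated with //):

# Stream sections out of the file one at a time:
for section, fields in read_transactions("transaction.txt"):
    print(section, fields)
# e.g. ('CU', {'Customer_ID': '', 'Last_Name': 'Johnston', ...})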
Now that the file is read, the rest is easier. We do type-conversion on each field, using the lookup table we defined above:
def format_field(section, key, value):
    """
    Cast a field value to the appropriate data type
    """
    return data_types[section][key](value)


def format_section(section, accum):
    """
    Cast all values in a section to the appropriate data types
    """
    return (section, {key: format_field(section, key, value)
                      for key, value in accum.items()})
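For example, a quick check on a hand-built VE section:

# Blank Owner_ID becomes the int 0; Year survives a round-trip through str():
section, fields = format_section("VE", {"Year": "1996", "Owner_ID": ""})
assert fields["Owner_ID"] == 0
assert str(fields["Year"]) == "1996"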
and write the results back to file:
def write_transactions(fname, transactions):
    with open(fname, "w") as outf:
        for section, accum in transactions:
            # start section
            outf.write("IN {}\n".format(section))
            # write key/value pairs in order by key
            keys = sorted(accum.keys())
            for key in keys:
                outf.write(" {}={}\n".format(key, accum[key]))
            # end section
            outf.write("//\n")
All the machinery is in place; we just have to call it:
def main():
    INPUT = "transaction.txt"
    OUTPUT = "customer.diff"

    transactions = read_transactions(INPUT)
    cleaned_transactions = (format_section(section, accum)
                            for section, accum in transactions)
    write_transactions(OUTPUT, cleaned_transactions)

if __name__ == "__main__":
    main()
Hope that helps!