I want to extract some specific information from CFT (a tool for copying files from one OS to another).
I parse the output of the command "cftutil listcat content=full".
I want to extract the dates, times, and number of records in the local and remote files...
For example, I get lines like
Records NRECS = 49016 Records NREC = 49016
where I want to extract the value after NRECS or NREC
or lines like
Begin date DATEB = 15/02/2019 End date DATEE = 15/02/2019
Begin time TIMEB = 12:18:21.05 End time TIMEE = 12:18:23.16
where I want the date and time for the beginning and end of the transfer.
My regex only catches the first match in each line.
My program:
import re
reg = r"""
(^.*)DIRECT\s+=\s(?P<direct>[A-Z]{4})
|
(^.*)DATEE\s+=\s(?P<date_end>\d{2}\/\d{2}\/\d{4})
|
(^.*)DATEB\s+=\s(?P<date_deb>\d{2}\/\d{2}\/\d{4})
|
(^.*)TIMEB\s+=\s(?P<hour_deb>\d{2}:\d{2}:\d{2}.\d{2})
|
(^.*)TIMEE\s+=\s(?P<hour_end>\d{2}:\d{2}:\d{2}.\d{2})
|
(^.*)NREC\s+=\s(?P<nb_records_loc>\d+)
|
(^.*)NRECS\s+=\s(?P<nbrecords_rem>\d+)
"""
pat1 = re.compile(reg, re.VERBOSE)

with open("CFT_FULL.TXT", "r") as a:
    source = a.read().split('\n')

for i, lin in enumerate(source):
    if (" FNAME" in lin or " NFNAME" in lin):
        print '\n'.join(source[i+1:i+2])
    m = re.search(pat1, lin)
    if m is not None:
        print m.lastgroup, "---> ", m.group(m.lastindex)
    if "JOBNAME" in lin:
        print lin, '\n'
Edit: partial output
date_end ---> 06/02/2019
hour_deb ---> 08:19:48.63
nb_records_loc ---> 139
But I should have
date_deb ---> 06/02/2019
date_end ---> 06/02/2019
hour_deb ---> 08:19:48.63
hour_end ---> 08:19:49.52
Thanks for any hint
Edit:
Eventually, the following code works fine:
m = re.search(pat1, lin)
if m is not None:
    for x in re.finditer(pat1, lin):
        print x.lastgroup, "--->", x.group(x.lastindex)
it prints
date_deb ---> 06/02/2019
date_end ---> 06/02/2019
hour_deb ---> 08:19:58.64
hour_end ---> 08:19:58.75
nbrecords_rem ---> 62
nbrecords_loc ---> 62
Change your regex to:
reg = r"""
DIRECT\s+=\s(?P<direct>[A-Z]{4})
|
DATEE\s+=\s(?P<date_end>\d{2}\/\d{2}\/\d{4})
|
DATEB\s+=\s(?P<date_deb>\d{2}\/\d{2}\/\d{4})
|
TIMEB\s+=\s(?P<hour_deb>\d{2}:\d{2}:\d{2}.\d{2})
|
TIMEE\s+=\s(?P<hour_end>\d{2}:\d{2}:\d{2}.\d{2})
|
NREC\s+=\s(?P<nb_records_loc>\d+)
|
NRECS\s+=\s(?P<nbrecords_rem>\d+)
"""
Also, as re.search() returns only the first match, I would suggest using re.finditer(). E.g., for the string below:
Begin date DATEB = 15/02/2019 End date DATEE = 15/02/2019
Begin time TIMEB = 12:18:21.05 End time TIMEE = 12:18:23.16
the expected output will be:
>>> for x in re.finditer(pat1, above_string):
...     print(x)
<_sre.SRE_Match object; span=(13, 34), match='DATEB = 15/02/2019'>
<_sre.SRE_Match object; span=(46, 67), match='DATEE = 15/02/2019'>
<_sre.SRE_Match object; span=(82, 104), match='TIMEB = 12:18:21.05'>
<_sre.SRE_Match object; span=(115, 137), match='TIMEE = 12:18:23.16'>
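If you want the extracted values collected rather than just printed, a minimal sketch (assuming the corrected pattern compiled as pat1 and a line in lin, as above) gathers every named-group hit on a line into a dict:

# Collect all named-group matches on one line (sketch; pat1 and lin as above).
fields = {}
for m in re.finditer(pat1, lin):
    fields[m.lastgroup] = m.group(m.lastindex)
# e.g. {'date_deb': '15/02/2019', 'date_end': '15/02/2019'} for a DATEB/DATEE line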
I'm trying to parse pactl list output with pyparsing: so far the parse mostly works, but I cannot make ZeroOrMore behave correctly.
Properties can appear as foo: or as foo: bar, and I tried to handle both with ZeroOrMore, but it doesn't work. I had to add the special case "Argument:" to catch results without a value, but there are also Argument: foo results (with a value), so that breaks, and I expect any other property may also appear without a value.
With this definition, and a fixed pactl list output:
#!/usr/bin/env python
#
# parsing pactl list
#
from pyparsing import *
import os
from subprocess import check_output
import sys
data = '''
Module #6
    Argument:
    Name: module-alsa-card
    Usage counter: 0
    Properties:
        module.author = "Lennart Poettering"
        module.description = "ALSA Card"
        module.version = "14.0-rebootstrapped"
'''
indentStack = [1]
stmt = Forward()
identifier = Word(alphanums+"-_.")
sect_def = Group(Group(identifier) + Suppress("#") + Group(Word(nums)))
inner_section = indentedBlock(stmt, indentStack)
section = (sect_def + inner_section)
value = Group(Group(Combine(OneOrMore(identifier|White(' ')))) + Suppress(":") + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_".')|White(' ', max=1)))))
prop_name = Literal("Properties:")
prop_section = indentedBlock(stmt, indentStack)
prop_val = Group(Group(identifier) + Suppress("=") + Group(Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))))
prop = (prop_name + prop_section)
stmt << ( section | prop | ("Argument:") | value | prop_val )
syntax = OneOrMore(stmt)
parseTree = syntax.parseString(data)
parseTree.pprint()
This gets:
$ ./pactl.py
Module #6
    Argument:
    Name: module-alsa-card
    Usage counter: 0
    Properties:
        module.author = "Lennart Poettering"
        module.description = "ALSA Card"
        module.version = "14.0-rebootstrapped"
[[['Module'], ['6']],
 [['Argument:'],
  [[['Name'], ['module-alsa-card']]],
  [[['Usage counter'], ['0']]],
  ['Properties:',
   [[[['module.author'], ['"Lennart Poettering"']]],
    [[['module.description'], ['"ALSA Card"']]],
    [[['module.version'], ['"14.0-rebootstrapped"']]]]]]]
So far so good. But when the special case for Argument: is removed, it runs into an error, as ZeroOrMore doesn't behave as expected:
#!/usr/bin/env python
#
# parsing pactl list
#
from pyparsing import *
import os
from subprocess import check_output
import sys
data = '''
Module #6
    Argument:
    Name: module-alsa-card
    Usage counter: 0
    Properties:
        module.author = "Lennart Poettering"
        module.description = "ALSA Card"
        module.version = "14.0-rebootstrapped"
'''
indentStack = [1]
stmt = Forward()
identifier = Word(alphanums+"-_.")
sect_def = Group(Group(identifier) + Suppress("#") + Group(Word(nums)))
inner_section = indentedBlock(stmt, indentStack)
section = (sect_def + inner_section)
value = Group(Group(Combine(OneOrMore(identifier|White(' ')))) + Suppress(":") + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_".')|White(' ', max=1))))).setDebug()
prop_name = Literal("Properties:")
prop_section = indentedBlock(stmt, indentStack)
prop_val = Group(Group(identifier) + Suppress("=") + Group(Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))))
prop = (prop_name + prop_section)
stmt << ( section | prop | value | prop_val )
syntax = OneOrMore(stmt)
parseTree = syntax.parseString(data)
parseTree.pprint()
This results in:
$ ./pactl.py
Module #6
    Argument:
    Name: module-alsa-card
    Usage counter: 0
    Properties:
        module.author = "Lennart Poettering"
        module.description = "ALSA Card"
        module.version = "14.0-rebootstrapped"
Match Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) at loc 19(3,9)
Matched Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) -> [[['Argument'], ['Name']]]
Match Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) at loc 1(2,1)
Exception raised:Expected ":", found '#' (at char 8), (line:2, col:8)
Traceback (most recent call last):
File "/home/alberto/projects/node/pacmd_list_json/./pactl.py", line 55, in <module>
parseTree = syntax.parseString(partial)
File "/usr/local/lib/python3.9/site-packages/pyparsing.py", line 1955, in parseString
raise exc
File "/usr/local/lib/python3.9/site-packages/pyparsing.py", line 6336, in checkUnindent
raise ParseException(s, l, "not an unindent")
pyparsing.ParseException: Expected {{Group:({Group:(W:(ABCD...)) Suppress:("#") Group:(W:(0123...))}) indented block} | {"Properties:" indented block} | Group:({Group:(Combine:({{W:(ABCD...) | <SP>}}...)) Suppress:(":") Group:(Combine:([{W:(ABCD...) | <SP>}]...))}) | Group:({Group:(W:(ABCD...)) Suppress:("=") Group:(Combine:({{W:(ABCD...) | <SP><TAB>}}...))})}, found ':' (at char 41), (line:4, col:13)
As the setDebug output shows, the value grammar's ZeroOrMore is picking up the tokens from the next line: [[['Argument'], ['Name']]].
I tried LineEnd() and other tricks, but none worked.
Any idea how to make ZeroOrMore stop on LineEnd(), without special cases?
NOTE: Real output can be retrieved using:
env = os.environ.copy()
env['LANG'] = 'C'
data = check_output(
    ['pactl', 'list'], universal_newlines=True, env=env)
indentedBlock is not the easiest pyparsing element to work with, and there are a few things you are doing that are getting in your way.
To debug this, I broke down some of your more complex expressions, used setName() to give them names, and then added .setDebug(). Like this:
identifier = Word(alphas, alphanums+"-_.").setName("identifier").setDebug()
This will tell pyparsing to output a message whenever this expression is about to be matched, if it matched successfully, or if not, the exception that was raised.
Match identifier at loc 1(2,1)
Matched identifier -> ['Module']
Match identifier at loc 15(3,5)
Matched identifier -> ['Argument']
Match identifier at loc 15(3,5)
Matched identifier -> ['Argument']
Match identifier at loc 23(3,13)
Exception raised:Expected identifier, found ':' (at char 23), (line:3, col:13)
It looks like these expressions are messing up the indentedBlock matching, by processing whitespace that should be indentation space:
Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))
The " character in the Word and the whitespace lead me to believe you are trying to match quoted strings. I replaced this expression with:
Combine(OneOrMore(Word(alphas, alphanums+'-/.') | quotedString))
You also need to take care not to read past the end of the line, or you'll also mess up the indentedBlock indentation tracking. I added this expression for a newline at the top:
NL = LineEnd()
and then used it as the stopOn argument to OneOrMore and ZeroOrMore:
prop_val_value = Combine(OneOrMore(Word(alphas, alphanums+'-/.') | quotedString(), stopOn=NL)).setName("prop_val_value")#.setDebug()
prop_val = Group(identifier + Suppress("=") + Group(prop_val_value)).setName("prop_val")#.setDebug()
Here is the parser I ended up with:
indentStack = [1]
stmt = Forward()
NL = LineEnd()
identifier = Word(alphas, alphanums+"-_.").setName("identifier").setDebug()
sect_def = Group(Group(identifier) + Suppress("#") + Group(Word(nums))).setName("sect_def")#.setDebug()
inner_section = indentedBlock(stmt, indentStack)
section = (sect_def + inner_section)
#~ value = Group(Group(Combine(OneOrMore(identifier|White(' ')))) + Suppress(":") + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_".')|White(' ', max=1))))).setDebug()
value_label = originalTextFor(OneOrMore(identifier)).setName("value_label")#.setDebug()
value = Group(value_label
              + Suppress(":")
              + Optional(~NL + Group(Combine(ZeroOrMore(Word(alphanums+'-/=_.') | quotedString(), stopOn=NL))))).setName("value")#.setDebug()
prop_name = Literal("Properties:")
prop_section = indentedBlock(stmt, indentStack)
#~ prop_val = Group(Group(identifier) + Suppress("=") + Group(Combine(OneOrMore(Word(alphanums+'-"/.')|White(' \t')))))
prop_val_value = Combine(OneOrMore(Word(alphas, alphanums+'-/.') | quotedString(), stopOn=NL)).setName("prop_val_value")#.setDebug()
prop_val = Group(identifier + Suppress("=") + Group(prop_val_value)).setName("prop_val")#.setDebug()
prop = (prop_name + prop_section).setName("prop")#.setDebug()
stmt << ( section | prop | value | prop_val )
Which gives this:
[[['Module'], ['6']],
 [[['Argument']],
  [['Name', ['module-alsa-card']]],
  [['Usage counter', ['0']]],
  ['Properties:',
   [[['module.author', ['"Lennart Poettering"']]],
    [['module.description', ['"ALSA Card"']]],
    [['module.version', ['"14.0-rebootstrapped"']]]]]]]
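The stopOn argument is what keeps a repetition from running past the end of the line and eating the next line's indentation. A minimal sketch of just that behavior, on a hypothetical two-line input:

from pyparsing import Word, OneOrMore, LineEnd, alphas

NL = LineEnd()
words = OneOrMore(Word(alphas))                 # greedy: skips over the newline
words_nl = OneOrMore(Word(alphas), stopOn=NL)   # stops at the end of the line

two_lines = "alpha beta\ngamma"
print(words.parseString(two_lines))     # -> ['alpha', 'beta', 'gamma']
print(words_nl.parseString(two_lines))  # -> ['alpha', 'beta']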
I have a table of tennis matches and I would like to create a query field that counts the number of previous matches within a date range of the date of the focal match. For example:
+----------+------------+---------------------------+
| match_id | date       | num_matches within 3 days |
+----------+------------+---------------------------+
| 1        | 01/01/2020 | 0                         |
| 2        | 02/01/2020 | 1                         |
| 3        | 03/01/2020 | 2                         |
| 4        | 05/01/2020 | 2                         |
| 5        | 05/01/2020 | 3                         |
| 6        | 10/01/2020 | 0                         |
+----------+------------+---------------------------+
I've tried using a correlated sub-query as set out here, but I can't figure out how to create the conditions:
@hybrid_method
def match_count(self, timespan_days):
    cut_off = self.date_time_inferred - timedelta(days=timespan_days)
    return sum(x >= cut_off and <some_cond_for_less_than_current_match_date> for x in self.date_time_inferred)

@match_count.expression
def match_count(cls, timespan_days):
    cut_off = cls.date_time_inferred - timedelta(days=timespan_days)
    return (
        select(func.count(cls.date_time_inferred)).
        where(
            and_(
                cls.date_time_inferred.__ge__(cut_off),
                cls.date_time_inferred.__lt__(<not_sure_what_goes_here>),
            )
        ).label('match_count')
    )
In trialling simpler forms of the above I also kept getting the following error:
sqlalchemy.exc.ArgumentError: columns argument to select() must be a Python list or other iterable
On the code:
select(func.count(cls.date))
So I'm clearly doing a whole heap of stuff wrong.
Hopefully someone can help me understand how to fix this? Also very open to different ways to accomplish the same thing. I would like to stick with a hybrid attribute though...
Below is the (almost) full code snippet:
# ... omitted import statements and session configuration

def _date(date_str):
    return datetime.strptime(date_str, "%Y-%m-%d")

class Match(Base):
    __tablename__ = "match"
    match_id = Column(Integer, primary_key=True)
    date = Column(Date, nullable=False)

    @hybrid_method
    def match_count(self, timespan_days):
        cut_off = self.date - timedelta(days=timespan_days)
        sess = object_session(self)
        M = Match
        q = (
            sess.query(M)
            # .filter(M.match_id != self.match_id)  # option-1: only other on the same day
            .filter(M.match_id < self.match_id)  # option-2: only smaller-id on the same day (as in OP)
            .filter(M.date <= self.date)
            .filter(M.date >= cut_off)
        )
        return q.count()

    @match_count.expression
    def match_count(cls, timespan_days):
        M = aliased(Match, name="other")
        cut_off = cls.date - timespan_days
        q = (
            select([func.count(M.match_id)])
            # .filter(Match.match_id != self.match_id)  # option-1: only other on the same day
            .where(M.match_id < cls.match_id)  # option-2: only smaller-id on the same day (as in OP)
            .where(M.date <= cls.date)
            .where(M.date >= cut_off)
        )
        return q.label("match_count")

def test():
    Base.metadata.drop_all()
    Base.metadata.create_all()

    from sys import version_info as py_version
    from sqlalchemy import __version__ as sa_version
    print(f"PY version={py_version}")
    print(f"SA version={sa_version}")
    print(f"SA engine={engine.name}")
    print("=" * 80)

    # 1. test data
    matches = [
        Match(date=_date("2020-01-01")),
        Match(date=_date("2020-01-02")),
        Match(date=_date("2020-01-03")),
        Match(date=_date("2020-01-05")),
        Match(date=_date("2020-01-05")),
        Match(date=_date("2020-01-10")),
    ]
    session.add_all(matches)
    session.commit()
    print("=" * 80)

    # 2. test query "in-memory"
    for m in session.query(Match):
        print(m, m.match_count(3))
    print("=" * 80)

    # 3. test query on "SQL"
    session.expunge_all()
    q = session.query(Match, Match.match_count(3))
    for match, match_count in q:
        print(match, match_count)
    print("=" * 80)

if __name__ == "__main__":
    test()
The code above produces the following output:
============================================================
PY version=sys.version_info(major=3, minor=8, micro=1, releaselevel='final', serial=0)
SA version=1.3.20
SA engine=postgresql
============================================================
<Match(date=datetime.date(2020, 1, 1), match_id=1)> 0
<Match(date=datetime.date(2020, 1, 2), match_id=2)> 1
<Match(date=datetime.date(2020, 1, 3), match_id=3)> 2
<Match(date=datetime.date(2020, 1, 5), match_id=4)> 2
<Match(date=datetime.date(2020, 1, 5), match_id=5)> 3
<Match(date=datetime.date(2020, 1, 10), match_id=6)> 0
============================================================
<Match(date=datetime.date(2020, 1, 1), match_id=1)> 0
<Match(date=datetime.date(2020, 1, 2), match_id=2)> 1
<Match(date=datetime.date(2020, 1, 3), match_id=3)> 2
<Match(date=datetime.date(2020, 1, 5), match_id=4)> 2
<Match(date=datetime.date(2020, 1, 5), match_id=5)> 3
<Match(date=datetime.date(2020, 1, 10), match_id=6)> 0
============================================================
The query q itself would look like below (in PostgreSQL):
SELECT match.match_id,
       match.date,
       (SELECT count(other.match_id) AS count_1
        FROM match AS other
        WHERE other.match_id < match.match_id
          AND other.date <= match.date
          AND other.date >= match.date - %(date_1)s) AS match_count
FROM match
One item I would like to point out is that the "in-memory" check is not very efficient, because one has to query the database for each Match instance. Therefore, I would use the last query if possible.
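Because match_count is a hybrid, the expression form can also be used in a filter, not only in the select list. A small sketch against the model above (the threshold is hypothetical):

# Matches preceded by at least 2 matches in the previous 3 days
# (sketch; Match and session as in the snippet above).
busy = session.query(Match).filter(Match.match_count(3) >= 2).all()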
I have an existing data table with two columns: an ID, and a list of IDs separated by commas.
For example:
ID | List
---------
1 | 1, 4, 5
3 | 2, 12, 1
I would like to split the column List so that I have a table like this:
ID | List
---------
1 | 1
1 | 4
1 | 5
3 | 2
3 | 12
3 | 1
I figured this out now:
tablename = 'Querysummary Data'
table = Document.Data.Tables[tablename]

topiccolname = 'TOPIC_ID'
topiccol = table.Columns[topiccolname]
topiccursor = DataValueCursor.Create[str](topiccol)

docscolname = 'DOC_IDS'
doccol = table.Columns[docscolname]
doccursor = DataValueCursor.Create[str](doccol)

myPanel = Document.ActivePageReference.FilterPanel
idxSet = myPanel.FilteringSchemeReference.FilteringSelectionReference.GetSelection(table).AsIndexSet()

keys = dict()
topdoc = dict()
for row in table.GetRows(idxSet, topiccursor, doccursor):
    keys[topiccursor.CurrentValue] = doccursor.CurrentValue
for key in keys:
    str = keys[key].split(",")
    for i in str:
        topdoc[key] = i
        print key + " " + i
Now I can print each topic ID with the corresponding doc ID.
How can I create a new data table in Spotfire using this dict()?
I finally solved it myself. Maybe there is better code, but it works:
# imports needed by this script (namespaces assumed from the Spotfire API)
import System
from System.Data import DataSet, DataTable
from System.IO import MemoryStream, StreamWriter, SeekOrigin
from Spotfire.Dxp.Data import DataType, DataValueCursor, DataTableSaveSettings
from Spotfire.Dxp.Data.Import import TextFileDataSource, TextDataReaderSettings

tablename = 'Querysummary Data'
table = Document.Data.Tables[tablename]

topiccolname = 'TOPIC_ID'
topiccol = table.Columns[topiccolname]
topiccursor = DataValueCursor.Create[str](topiccol)

docscolname = 'DOC_IDS'
doccol = table.Columns[docscolname]
doccursor = DataValueCursor.Create[str](doccol)

myPanel = Document.ActivePageReference.FilterPanel
idxSet = myPanel.FilteringSchemeReference.FilteringSelectionReference.GetSelection(table).AsIndexSet()

# build a string representing the data in semicolon-delimited text format
textData = "TOPIC_ID;DOC_IDS\r\n"
keys = dict()
topdoc = dict()
for row in table.GetRows(idxSet, topiccursor, doccursor):
    keys[topiccursor.CurrentValue] = doccursor.CurrentValue
for key in keys:
    str = keys[key].split(",")
    for i in str:
        textData += key + ";" + i + "\r\n"

dataSet = DataSet()
dataTable = DataTable("DOCIDS")
dataTable.Columns.Add("TOPIC_ID", System.String)
dataTable.Columns.Add("DOC_IDS", System.String)
dataSet.Tables.Add(dataTable)

# make a stream from the string
stream = MemoryStream()
writer = StreamWriter(stream)
writer.Write(textData)
writer.Flush()
stream.Seek(0, SeekOrigin.Begin)

# set up the text data reader (the table has two string columns)
readerSettings = TextDataReaderSettings()
readerSettings.Separator = ";"
readerSettings.AddColumnNameRow(0)
readerSettings.SetDataType(0, DataType.String)
readerSettings.SetDataType(1, DataType.String)

# create a data source to read in the stream
textDataSource = TextFileDataSource(stream, readerSettings)

# add the data into a Data Table in Spotfire
if Document.Data.Tables.Contains("Querysummary Mapping"):
    Document.Data.Tables["Querysummary Mapping"].ReplaceData(textDataSource)
else:
    newTable = Document.Data.Tables.Add("Querysummary Mapping", textDataSource)
    tableSettings = DataTableSaveSettings(newTable, False, False)
    Document.Data.SaveSettings.DataTableSettings.Add(tableSettings)
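As a side note, the two dicts aren't strictly needed: the delimited string can be built directly in the cursor loop, which also avoids shadowing the built-in str and strips stray spaces after the commas. A sketch of just the string-building part, assuming the same table, cursors, and idxSet as above:

# Build the semicolon-delimited text directly from the rows (sketch).
textData = "TOPIC_ID;DOC_IDS\r\n"
for row in table.GetRows(idxSet, topiccursor, doccursor):
    topic = topiccursor.CurrentValue
    for doc_id in doccursor.CurrentValue.split(","):
        textData += topic + ";" + doc_id.strip() + "\r\n"

One difference: the dict in the original keeps only the last row per TOPIC_ID, while the direct loop keeps every row, which may or may not be what you want.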
I am trying to read and parse the data dictionary for the Census Bureau's American Community Survey Public Use Microdata Sample (PUMS) data release, as found here.
It is reasonably well formatted, although with a few lapses where explanatory notes are inserted.
My preferred outcome is a dataframe with one row per variable, serializing all value labels for a given variable into one dictionary stored in a value-dictionary field in the same row (although a hierarchical JSON-like format would not be bad, just more complicated).
I have the following code:
import pandas as pd
import re
import urllib2

data = urllib2.urlopen('http://www.census.gov/acs/www/Downloads/data_documentation/pums/DataDict/PUMSDataDict13.txt').read()  # read() so the replace below works on a str

## replace newline characters so we can use dots and find everything until a double
## carriage return (replaced to ||) with a lookahead assertion.
data = data.replace('\n', '|')

datadict = pd.DataFrame(
    re.findall("([A-Z]{2,8})\s{2,9}([0-9]{1})\s{2,6}\|\s{2,4}([A-Za-z\-\(\) ]{3,85})", data, re.MULTILINE),
    columns=['variable', 'width', 'description'])
datadict.head(5)
+----+----------+-------+------------------------------------------------+
|    | variable | width | description                                    |
+----+----------+-------+------------------------------------------------+
| 0  | RT       | 1     | Record Type                                    |
+----+----------+-------+------------------------------------------------+
| 1  | SERIALNO | 7     | Housing unit                                   |
+----+----------+-------+------------------------------------------------+
| 2  | DIVISION | 1     | Division code                                  |
+----+----------+-------+------------------------------------------------+
| 3  | PUMA     | 5     | Public use microdata area code (PUMA) based on |
+----+----------+-------+------------------------------------------------+
| 4  | REGION   | 1     | Region code                                    |
+----+----------+-------+------------------------------------------------+
| 5  | ST       | 2     | State Code                                     |
+----+----------+-------+------------------------------------------------+
So far so good. The list of variables is there, along with the width in characters of each.
I can expand this and get additional lines (where the value labels live), like so:
datadict_exp = pd.DataFrame(
    re.findall("([A-Z]{2,9})\s{2,9}([0-9]{1})\s{2,6}\|\s{4}([A-Za-z\-\(\)\;\<\> 0-9]{2,85})\|\s{11,15}([a-z0-9]{0,2})[ ]\.([A-Za-z/\-\(\) ]{2,120})",
        data, re.MULTILINE))
datadict_exp.head(5)
+----+----------+-------+---------------------------------------------------+---------+--------------+
| id | variable | width | description                                       | value_1 | label_1      |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 0  | DIVISION | 1     | Division code                                     | 0       | Puerto Rico  |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 1  | REGION   | 1     | Region code                                       | 1       | Northeast    |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 2  | ST       | 2     | State Code                                        | 1       | Alabama/AL   |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 3  | NP       | 2     | Number of person records following this housin... | 0       | Vacant unit  |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 4  | TYPE     | 1     | Type of unit                                      | 1       | Housing unit |
+----+----------+-------+---------------------------------------------------+---------+--------------+
So that gets the first value and its associated label. My regex issue is how to repeat the multi-line match starting with \s{11,15} through to the end, i.e. some variables have tons of unique values (ST, the state code, is followed by some 50 lines, denoting the value and label for each state).
I changed the carriage returns in the source file to pipes early on, thinking that I could then shamelessly rely on the dot to match everything until a double carriage return (now ||), indicating the end of that particular variable, and here is where I got stuck.
So: how do I repeat a multi-line pattern an arbitrary number of times?
(A complication for later is that some variables are not fully enumerated in the dictionary, but are shown with valid ranges of values. NP, for example (the number of persons associated with the same household), is denoted with 02..20 following the description. If I don't account for this, my parsing will miss such entries, of course.)
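One way to avoid repeating the multi-line group in a single pattern is a two-pass approach: first capture each variable's whole block (everything up to the double pipe), then pull the value/label pairs out of that block. A rough sketch, assuming data has already had its newlines replaced by | as above (the patterns are illustrative, not tuned to every quirk of the file):

# Pass 1: one block per variable, running up to the blank line (now '||').
block_pat = re.compile(r"([A-Z]{2,9})\s+(\d)\s*\|(.*?)\|\|")
# Pass 2: value/label pairs inside a block, e.g. '0 .Puerto Rico'.
pair_pat = re.compile(r"([A-Za-z0-9.]+)\s+\.([^|]+)")

for name, width, body in block_pat.findall(data):
    labels = dict(pair_pat.findall(body))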
This isn't a regex, but I parsed PUMSDataDict2013.txt and PUMS_Data_Dictionary_2009-2013.txt (Census ACS 2013 documentation, FTP server) with the Python 3.x script below. I used pandas.DataFrame.from_dict and pandas.concat to create a hierarchical dataframe, also below.
Python 3.x function to parse PUMSDataDict2013.txt and PUMS_Data_Dictionary_2009-2013.txt:
import collections
import os

def parse_pumsdatadict(path:str) -> collections.OrderedDict:
    r"""Parse ACS PUMS Data Dictionaries.

    Args:
        path (str): Path to downloaded data dictionary.

    Returns:
        ddict (collections.OrderedDict): Parsed data dictionary with original
            key order preserved.

    Raises:
        FileNotFoundError: Raised if `path` does not exist.

    Notes:
        * Only some data dictionaries have been tested.[^urls]
        * Values are all strings. No data types are inferred from the
          original file.
        * Example structure of returned `ddict`:
            ddict['title'] = '2013 ACS PUMS DATA DICTIONARY'
            ddict['date'] = 'August 7, 2015'
            ddict['record_types']['HOUSING RECORD']['RT']\
                ['length'] = '1'
                ['description'] = 'Record Type'
                ['var_codes']['H'] = 'Housing Record or Group Quarters Unit'
            ddict['record_types']['HOUSING RECORD'][...]
            ddict['record_types']['PERSON RECORD'][...]
            ddict['notes'] =
                ['Note for both Industry and Occupation lists...',
                 '* In cases where the SOC occupation code ends...',
                 ...]

    References:
        [^urls]: http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/
            PUMSDataDict2013.txt
            PUMS_Data_Dictionary_2009-2013.txt

    """
    # Check arguments.
    if not os.path.exists(path):
        raise FileNotFoundError(
            "Path does not exist:\n{path}".format(path=path))
    # Parse data dictionary.
    # Note:
    # * Data dictionary keys and values are "codes for variables",
    #   using the ACS terminology,
    #   https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html
    # * The data dictionary is not all encoded in UTF-8. Replace encoding
    #   errors when found.
    # * Catch instances of inconsistently formatted data.
    ddict = collections.OrderedDict()
    with open(path, encoding='utf-8', errors='replace') as fobj:
        # Data dictionary name is line 1.
        ddict['title'] = fobj.readline().strip()
        # Data dictionary date is line 2.
        ddict['date'] = fobj.readline().strip()
        # Initialize flags to catch lines.
        (catch_var_name, catch_var_desc,
         catch_var_code, catch_var_note) = (None, )*4
        var_name = None
        var_name_last = 'PWGTP80'  # Necessary for unformatted end-of-file notes.
        for line in fobj:
            # Replace tabs with 4 spaces
            line = line.replace('\t', ' '*4).rstrip()
            # Record type is section header 'HOUSING RECORD' or 'PERSON RECORD'.
            if (line.strip() == 'HOUSING RECORD'
                    or line.strip() == 'PERSON RECORD'):
                record_type = line.strip()
                if 'record_types' not in ddict:
                    ddict['record_types'] = collections.OrderedDict()
                ddict['record_types'][record_type] = collections.OrderedDict()
            # A newline precedes a variable name.
            # A newline follows the last variable code.
            elif line == '':
                # Example inconsistent format case:
                # WGTP54 5
                #     Housing Weight replicate 54
                #
                #         -9999..09999 .Integer weight of housing unit
                if (catch_var_code
                        and 'var_codes' not in ddict['record_types'][record_type][var_name]):
                    pass
                # Terminate the previous variable block and look for the next
                # variable name, unless past last variable name.
                else:
                    catch_var_code = False
                    catch_var_note = False
                    if var_name != var_name_last:
                        catch_var_name = True
            # Variable name is 1 line with 0 space indent.
            # Variable name is followed by variable description.
            # Variable note is optional.
            # Variable note is preceded by newline.
            # Variable note is 1+ lines.
            # Variable note is followed by newline.
            elif (catch_var_name and not line.startswith(' ')
                    and var_name != var_name_last):
                # Example: "Note: Public use microdata areas (PUMAs) ..."
                if line.lower().startswith('note:'):
                    var_note = line.strip()  # type(var_note) == str
                    if 'notes' not in ddict['record_types'][record_type][var_name]:
                        ddict['record_types'][record_type][var_name]['notes'] = list()
                    # Append a new note.
                    ddict['record_types'][record_type][var_name]['notes'].append(var_note)
                    catch_var_note = True
                # Example: """
                # Note: Public Use Microdata Areas (PUMAs) designate areas ...
                # population. Use with ST for unique code. PUMA00 applies ...
                # ...
                # """
                elif catch_var_note:
                    var_note = line.strip()  # type(var_note) == str
                    if 'notes' not in ddict['record_types'][record_type][var_name]:
                        ddict['record_types'][record_type][var_name]['notes'] = list()
                    # Concatenate to most recent note.
                    ddict['record_types'][record_type][var_name]['notes'][-1] += ' '+var_note
                # Example: "NWAB 1 (UNEDITED - See 'Employment Status Recode' (ESR))"
                else:
                    # type(var_note) == list
                    (var_name, var_len, *var_note) = line.strip().split(maxsplit=2)
                    ddict['record_types'][record_type][var_name] = collections.OrderedDict()
                    ddict['record_types'][record_type][var_name]['length'] = var_len
                    # Append a new note if exists.
                    if len(var_note) > 0:
                        if 'notes' not in ddict['record_types'][record_type][var_name]:
                            ddict['record_types'][record_type][var_name]['notes'] = list()
                        ddict['record_types'][record_type][var_name]['notes'].append(var_note[0])
                    catch_var_name = False
                    catch_var_desc = True
                    var_desc_indent = None
            # Variable description is 1+ lines with 1+ space indent.
            # Variable description is followed by variable code(s).
            # Variable code(s) is 1+ line with larger whitespace indent
            # than variable description. Example:"""
            # PUMA00 5
            #     Public use microdata area code (PUMA) based on Census 2000 definition for data
            #     collected prior to 2012. Use in combination with PUMA10.
            #         00100..08200 .Public use microdata area codes
            #         77777 .Combination of 01801, 01802, and 01905 in Louisiana
            #         -0009 .Code classification is Not Applicable because data
            #               .collected in 2012 or later
            # """
            # The last variable code is followed by a newline.
            elif (catch_var_desc or catch_var_code) and line.startswith(' '):
                indent = len(line) - len(line.lstrip())
                # For line 1 of variable description.
                if catch_var_desc and var_desc_indent is None:
                    var_desc_indent = indent
                    var_desc = line.strip()
                    ddict['record_types'][record_type][var_name]['description'] = var_desc
                # For lines 2+ of variable description.
                elif catch_var_desc and indent <= var_desc_indent:
                    var_desc = line.strip()
                    ddict['record_types'][record_type][var_name]['description'] += ' '+var_desc
                # For lines 1+ of variable codes.
                else:
                    catch_var_desc = False
                    catch_var_code = True
                    is_valid_code = None
                    if not line.strip().startswith('.'):
                        # Example case: "01 .One person record (one person in household or"
                        if ' .' in line:
                            (var_code, var_code_desc) = line.strip().split(
                                sep=' .', maxsplit=1)
                            is_valid_code = True
                        # Example inconsistent format case:"""
                        # bbbb. N/A (age less than 15 years; never married)
                        # """
                        elif '. ' in line:
                            (var_code, var_code_desc) = line.strip().split(
                                sep='. ', maxsplit=1)
                            is_valid_code = True
                        else:
                            raise AssertionError(
                                "Program error. Line unaccounted for:\n" +
                                "{line}".format(line=line))
                        if is_valid_code:
                            if 'var_codes' not in ddict['record_types'][record_type][var_name]:
                                ddict['record_types'][record_type][var_name]['var_codes'] = collections.OrderedDict()
                            ddict['record_types'][record_type][var_name]['var_codes'][var_code] = var_code_desc
                    # Example case: ".any person in group quarters)"
                    else:
                        var_code_desc = line.strip().lstrip('.')
                        ddict['record_types'][record_type][var_name]['var_codes'][var_code] += ' '+var_code_desc
            # Example inconsistent format case:"""
            # ADJHSG 7
            # Adjustment factor for housing dollar amounts (6 implied decimal places)
            # """
            elif (catch_var_desc and
                    'description' not in ddict['record_types'][record_type][var_name]):
                var_desc = line.strip()
                ddict['record_types'][record_type][var_name]['description'] = var_desc
                catch_var_desc = False
                catch_var_code = True
            # Example inconsistent format case:"""
            # WGTP10 5
            #     Housing Weight replicate 10
            #         -9999..09999 .Integer weight of housing unit
            # WGTP11 5
            #     Housing Weight replicate 11
            #         -9999..09999 .Integer weight of housing unit
            # """
            elif ((var_name == 'WGTP10' and 'WGTP11' in line)
                    or (var_name == 'YOEP12' and 'ANC' in line)):
                # type(var_note) == list
                (var_name, var_len, *var_note) = line.strip().split(maxsplit=2)
                ddict['record_types'][record_type][var_name] = collections.OrderedDict()
                ddict['record_types'][record_type][var_name]['length'] = var_len
                if len(var_note) > 0:
                    if 'notes' not in ddict['record_types'][record_type][var_name]:
                        ddict['record_types'][record_type][var_name]['notes'] = list()
                    ddict['record_types'][record_type][var_name]['notes'].append(var_note[0])
                catch_var_name = False
                catch_var_desc = True
                var_desc_indent = None
            else:
                if (catch_var_name, catch_var_desc,
                        catch_var_code, catch_var_note) != (False, )*4:
                    raise AssertionError(
                        "Program error. All flags to catch lines should be set " +
                        "to `False` by end-of-file.")
                if var_name != var_name_last:
                    raise AssertionError(
                        "Program error. End-of-file notes should only be read " +
                        "after `var_name_last` has been processed.")
                if 'notes' not in ddict:
                    ddict['notes'] = list()
                ddict['notes'].append(line)
    return ddict
Create the hierarchical dataframe (formatted below as Jupyter Notebook cells):
In [ ]:
import pandas as pd
ddict = parse_pumsdatadict(path=r'/path/to/PUMSDataDict2013.txt')
tmp = dict()
for record_type in ddict['record_types']:
    tmp[record_type] = pd.DataFrame.from_dict(ddict['record_types'][record_type], orient='index')
df_ddict = pd.concat(tmp, names=['record_type', 'var_name'])
df_ddict.head()
Out[ ]:
                         length                                        description                                           var_codes                                               notes
record_type    var_name
HOUSING RECORD ACCESS         1                             Access to the Internet  {'b': 'N/A (GQ)', '1': 'Yes, with subscription...                                                 NaN
               ACR            1                                           Lot size  {'b': 'N/A (GQ/not a one-family house or mobil...                                                 NaN
               ADJHSG         7  Adjustment factor for housing dollar amounts (...              {'1000000': '2013 factor (1.000000)'}  [Note: The value of ADJHSG inflation-adjusts r...
               ADJINC         7  Adjustment factor for income and earnings doll...              {'1007549': '2013 factor (1.007549)'}  [Note: The value of ADJINC inflation-adjusts r...
               AGS            1       Sales of Agriculture Products (Yearly sales)  {'b': 'N/A (GQ/vacant/not a one family house o...    [Note: no adjustment factor is applied to AGS.]
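With the hierarchical index in place, the parsed codes for a single variable can be looked up directly, e.g.:

In [ ]:
df_ddict.loc[('HOUSING RECORD', 'ACCESS'), 'var_codes']
Out[ ]:
{'b': 'N/A (GQ)', '1': 'Yes, with subscription...', ...}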
I have tables which look like this:
text = """
ID = 1234

Hello World              135,343   117,668    81,228
Another line of text    (30,632)            (48,063)
More text                      0    11,205         0
Even more text                       1,447       681

ID = 18372

Another table               35,323    38,302   909,381
Another line with text          13        15
More text here                             7         0
Even more text here          7,011     1,447       681
"""
Is there a way to replace the "blank" entries in each table with 0? I am trying to set delimiters between the entries, but the following code can't deal with the blank spots in the tables:
for line in text.splitlines():
    if 'ID' not in line:
        line1 = line.split()
        line = '|'.join((' '.join(line1[:-3]), '|'.join(line1[-3:])))
        print line
    else:
        print line
The output is:
ID = 1234
|
Hello World|135,343|117,668|81,228
Another line of|text|(30,632)|(48,063)
More text|0|11,205|0
Even more|text|1,447|681
|
ID = 18372
|
Another table|35,323|38,302|909,381
Another line with|text|13|15
More text|here|7|0
Even more text here|7,011|1,447|681
As you can see, the first problem shows up on the second line of the first table: the word 'text' gets pushed into the first numeric column. Is there any way in Python to fix this and replace the blank entries with 0?
Here is a function for finding the columns in a bunch of lines. The second argument pat defines what counts as a column separator, and can be any regex.
import itertools as it
import re

def find_columns(lines, pat = r' '):
    '''
    Usage:
        widths = find_columns(lines)
        for line in lines:
            if not line: continue
            vals = [ line[widths[i]:widths[i+1]].strip() for i in range(len(widths)-1) ]
    '''
    widths = []
    maxlen = max(len(line) for line in lines)
    for line in lines:
        line = ''.join([line, ' '*(maxlen-len(line))])
        candidates = []
        for match in re.finditer(pat, line):
            candidates.extend(range(match.start(), match.end()+1))
        widths.append(set(candidates))
    widths = sorted(set.intersection(*widths))
    diffs = [widths[i+1]-widths[i] for i in range(len(widths)-1)]
    diffs = [None]+diffs
    widths = [w for d, w in zip(diffs, widths) if d != 1]
    if widths[0] != 0: widths = [0]+widths
    # include the right-hand edge as well, so the final column is not
    # dropped from the slices (added fix)
    if widths[-1] != maxlen: widths = widths+[maxlen]
    return widths
def report(text):
    for key, group in it.groupby(text.splitlines(), lambda line: line.startswith('ID')):
        lines = list(group)
        if key:
            print('\n'.join(lines))
        else:
            # r'\s(?![a-zA-Z])' defines a column separator to be any whitespace
            # not followed by alphabetic characters.
            widths = find_columns(lines, pat = r'\s(?![a-zA-Z])')
            for line in lines:
                if not line: continue
                vals = [ line[widths[i]:widths[i+1]] for i in range(len(widths)-1) ]
                vals = [v if v.strip() else v[1:]+'0' for v in vals]
                print('|'.join(vals))
text = """\
ID = 1234
Hello World              135,343   117,668    81,228
Another line of text    (30,632)            (48,063)
More text                      0    11,205         0
Even more text                       1,447       681
ID = 18372
Another table               35,323    38,302   909,381
Another line with text          13        15
More text here                             7         0
Even more text here          7,011     1,447       681
"""
report(text)
yields
ID = 1234
Hello World         |     135,343|   117,668|    81,228
Another line of text|    (30,632)|         0|  (48,063)
More text           |           0|    11,205|         0
Even more text      |           0|     1,447|       681
ID = 18372
Another table         |      35,323|    38,302|   909,381
Another line with text|          13|        15|         0
More text here        |           0|         7|         0
Even more text here   |       7,011|     1,447|       681
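For reference, with the tables spaced as above, the column boundaries that find_columns picks for the first table are:

table1 = text.splitlines()[1:5]   # the four data lines of the first table
print(find_columns(table1, pat=r'\s(?![a-zA-Z])'))
# -> [0, 20, 32, 42, 52]   (0 and the line length are the prepended/appended sentinels)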