I have an excel file with user defined business rules as below:
Column_Name|Operator|Column_Value1|Operand|RuleID|Result
ABC | Equal| 12| and| 1| 1
CDE | Equal| 10| and| 1| 1
XYZ | Equal| AD| | 1| 1.5
ABC | Equal| 11| and| 2| 1
CDE | Equal| 10| | 2| 1.2
and so on. (just for formatting purpose have put | symbol).
Input file (CSV) will look like below:
ABC,CDE,XYZ
12,10,AD
11,10,AD
Goal here is to derive an output column called Result which needs to be looked up to the user defined business rule excel.
Output Expected:
ABC,CDE,XYZ,Result
12,10,AD,1.5
11,10,AD,1.2
I have so far tried to generate an if statement and trying to assign the entire if/elif statement to a function. So that I can pass it to below statement to apply the rules.
ouput_df['result'] = input_df.apply(result_func, axis=1)
When I have the function with manually coding the rules it works as shown below:
def result_func(input_df):
if (input_df['ABC'] == 12):
return '1.25'
elif (ip_df['ABC'] == 11):
return '0.25'
else:
return '1'
Is this the right way of handling this scenario? If so how do I pass the entire dynamically generated if/elif to the function?
Code
import pandas as pd
import csv
# Load rules table
rules_table = []
with open('rules.csv') as csvfile:
reader = csv.DictReader(csvfile, delimiter='|')
for row in reader:
rules_table.append([x.strip() for x in row.values()])
# Load CSV file into DataFrame
df = pd.read_csv('data.csv', sep=",")
def rules_eval(row, rules):
" Steps through rules table for appropriate value "
def operator_eval(op, col, value):
if op == 'Equal':
return str(row[col]) == str(value)
else:
# Curently only Equal supported
raise ValueError(f"Unsupported Operator Value {op}, only Equal allowed")
prev_rule = '~'
for col, op, val, operand, rule, res in rules:
# loop through rows of rule table
if prev_rule != rule:
# rule ID changed so we can follow rule chains again
ignore_rule = False
if not ignore_rule:
if operator_eval(op, col, val):
if operand != 'and':
return res
else:
# Rule didn't work for an item in group
# ignore subsequent rules with this id
ignore_rule = True
prev_rule = rule
return None
df['results'] = df.apply(lambda row: rules_eval(row, rules_table), axis=1)
print(df)
Output
ABC CDE XYZ results
0 12 10 AD 1.5
1 11 10 AD 1.2
Explanation
df.apply - applies the rules_eval function to each row of the DataFrame.
The output is placed into column 'result' via
df['result'] = ...
Handling Rule Priority
Change
Added a Priority column to the rules_table so rules with the same RuleID are processed in order of priority.
Priority order decided by tuple ordering added to heap, currently
Priority, Column_Name, Operator, Column_Value, Operand, RuleID, Result
Code
import pandas as pd
import csv
from collections import namedtuple
from heapq import (heappush, heappop)
# Load CSV file into DataFrame
df = pd.read_csv('data.csv', sep=",")
class RulesEngine():
###########################################
# Static members
###########################################
# Named tuple for rules
fieldnames = 'Column_Name|Operator|Column_Value1|Operand|RuleID|Priority|Result'
Rule = namedtuple('Rule', fieldnames.replace('|', ' '))
number_fields = fieldnames.count('|') + 1
###########################################
# members
###########################################
def __init__(self, table_file):
# Load rules table
rules_table = []
with open(table_file) as csvfile:
reader = csv.DictReader(csvfile, delimiter='|')
for row in reader:
fields = [self.convert(x.strip()) for x in row.values() if x is not None]
if len(fields) != self.number_fields:
# Incorrect number of values
error = f"Rules require {self.number_fields} fields per row, was given {len(fields)}"
raise ValueError(error)
rules_table.append([self.convert(x.strip()) for x in row.values()])
#rules_table.append([x.strip() for x in row.values()])
self.rules_table = rules_table
def convert(self, s):
" Convert string to (int, float, or leave current value) "
try:
return int(s)
except ValueError:
try:
return float(s)
except ValueError:
return s
def operator_eval(self, row, rule):
" Determines value for a rule "
if rule.Operator == 'Equal':
return str(row[rule.Column_Name]) == str(rule.Column_Value1)
else:
# Curently only Equal supported
error = f"Unsupported Operator {rule.Operator}, only Equal allowed"
raise ValueError(error)
def get_rule_value(self, row, rule_queue):
" Value of a rule or None if no matching rule "
found_match = True
while rule_queue:
priority, rule_to_process = heappop(rule_queue)
if not self.operator_eval(row, rule_to_process):
found_match = False
break
return rule_to_process.Result if found_match else None
def rules_eval(self, row):
" Steps through rules table for appropriate value "
rule_queue = []
for index, r in enumerate(self.rules_table):
# Create named tuple with current rule values
current_rule = self.Rule(*r)
if not rule_queue or \
rule_queue[-1][1].RuleID == current_rule.RuleID:
# note: rule_queue[-1][1].RuleID is previous rule
# Within same rule group or last rule of group
priority = current_rule.Priority
# heap orders rules by pririty
# (lowest numbers are processed first)
heappush(rule_queue, (priority, current_rule))
if index < len(self.rules_table)-1:
continue # not at last rule, so keep accumulating
# Process rules in the rules queue
rule_value = self.get_rule_value(row, rule_queue)
if rule_value:
return rule_value
else:
# Starting over with new rule group
rule_queue = []
priority = current_rule.Priority
heappush(rule_queue, (priority, current_rule))
# Process Final queue if not empty
return self.get_rule_value(row, rule_queue)
# Init rules engine with rules from CSV file
rules_engine = RulesEngine('rules.csv')
df['results'] = df.apply(rules_engine.rules_eval, axis=1)
print(df)
Data Table
ABC,CDE,XYZ
12,10,AD
11,10,AD
12,12,AA
Rules Table
Column_Name|Operator|Column_Value1|Operand|RuleID|Priority|Result
ABC | Equal| 12| and| 1| 2|1
CDE | Equal| 10| and| 1| 1|1
XYZ | Equal| AD| and| 1| 3|1.5
ABC | Equal| 11| and| 2| 1|1
CDE | Equal| 10| foo| 2| 2|1.2
ABC | Equal| 12| foo| 3| 1|1.8
Output
ABC CDE XYZ results
0 12 10 AD 1.5
1 11 10 AD 1.2
2 12 12 AA 1.8
Related
When I created a hive table, the data is as follows.
data file
<__name__>abc
<__code__>1
<__value__>1234
<__name__>abcdef
<__code__>2
<__value__>12345
<__name__>abcdef
<__code__>2
<__value__>12345
1234156321
<__name__>abcdef
<__code__>2
<__value__>12345
...
Can I create a table right away without converting the file?
It's a plain text file, three columns are repeated.
How to convert dataframe? or csv file?
I want
| name | code | value
| abc | 1 | 1234
| abcdef | 2 | 12345
...
or
abc,1,1234
abcdef,2,12345
...
I solved my problem like this.
data = spark.read.text(path)
rows = data.rdd.zipWithIndex().map(lambda x: Row(x[0].value, int(x[1]/3)))
schema = StructType() \
.add("col1",StringType(), False) \
.add("record_pos",IntegerType(), False)
df = spark.createDataFrame(rows, schema)
df1 = df.withColumn("key", regexp_replace(split(df["col1"], '__>')[0], '<|__', '')) \
.withColumn("value", regexp_replace(regexp_replace(split(df["col1"], '__>')[1], '\n', '<NL>'), '\t', '<TAB>'))
dataframe = df1.groupBy("record_pos").pivot("key").agg(first("value")).drop("record_pos")
dataframe.show()
val path = "file:///C:/stackqustions/data/stackq5.csv"
val data = sc.textFile(path)
import spark.implicits._
val rdd = data.zipWithIndex.map {
case (records, index) => Row(records, index / 3)
}
val schema = new StructType().add("col1", StringType, false).add("record_pos", LongType, false)
val df = spark.createDataFrame(rdd, schema)
val df1 = df
.withColumn("key", regexp_replace(split($"col1", ">")(0), "<|__", ""))
.withColumn("value", split($"col1", ">")(1)).drop("col1")
df1.groupBy("record_pos").pivot("key").agg(first($"value")).drop("record_pos").show
result:
+----+------+-----+
|code| name|value|
+----+------+-----+
| 1| abc| 1234|
| 2|abcdef|12345|
| 2|abcdef|12345|
| 2|abcdef|12345|
+----+------+-----+
I have a existing data table with two columns, one is a ID and one is a list of IDs, separated by comma.
For example
ID | List
---------
1 | 1, 4, 5
3 | 2, 12, 1
I would like to split the column List so that I have a table like this:
ID | List
---------
1 | 1
1 | 4
1 | 5
3 | 2
3 | 12
3 | 1
I figured this out now:
tablename='Querysummary Data'
table=Document.Data.Tables[tablename]
topiccolname='TOPIC_ID'
topiccol=table.Columns[topiccolname]
topiccursor=DataValueCursor.Create[str](topiccol)
docscolname='DOC_IDS'
doccol=table.Columns[docscolname]
doccursor=DataValueCursor.Create[str](doccol)
myPanel = Document.ActivePageReference.FilterPanel
idxSet = myPanel.FilteringSchemeReference.FilteringSelectionReference.GetSelection(table).AsIndexSet()
keys=dict()
topdoc=dict()
for row in table.GetRows(idxSet,topiccursor,doccursor):
keys[topiccursor.CurrentValue]=doccursor.CurrentValue
for key in keys:
str = keys[key].split(",")
for i in str:
topdoc[key]=i
print key + " " +i
now I can print the topic id with the corresponding id.
How can I create a new data table in Spotfire using this dict()?
I solved it myself finally..maybe there is some better code but it works:
tablename='Querysummary Data'
table=Document.Data.Tables[tablename]
topiccolname='TOPIC_ID'
topiccol=table.Columns[topiccolname]
topiccursor=DataValueCursor.Create[str](topiccol)
docscolname='DOC_IDS'
doccol=table.Columns[docscolname]
doccursor=DataValueCursor.Create[str](doccol)
myPanel = Document.ActivePageReference.FilterPanel
idxSet = myPanel.FilteringSchemeReference.FilteringSelectionReference.GetSelection(table).AsIndexSet()
# build a string representing the data in tab-delimited text format
textData = "TOPIC_ID;DOC_IDS\r\n"
keys=dict()
topdoc=dict()
for row in table.GetRows(idxSet,topiccursor,doccursor):
keys[topiccursor.CurrentValue]=doccursor.CurrentValue
for key in keys:
str = keys[key].split(",")
for i in str:
textData += key + ";" + i + "\r\n"
dataSet = DataSet()
dataTable = DataTable("DOCIDS")
dataTable.Columns.Add("TOPIC_ID", System.String)
dataTable.Columns.Add("DOC_IDS", System.String)
dataSet.Tables.Add(dataTable)
# make a stream from the string
stream = MemoryStream()
writer = StreamWriter(stream)
writer.Write(textData)
writer.Flush()
stream.Seek(0, SeekOrigin.Begin)
# set up the text data reader
readerSettings = TextDataReaderSettings()
readerSettings.Separator = ";"
readerSettings.AddColumnNameRow(0)
readerSettings.SetDataType(0, DataType.String)
readerSettings.SetDataType(1, DataType.String)
readerSettings.SetDataType(2, DataType.String)
# create a data source to read in the stream
textDataSource = TextFileDataSource(stream, readerSettings)
# add the data into a Data Table in Spotfire
if Document.Data.Tables.Contains("Querysummary Mapping"):
Document.Data.Tables["Querysummary Mapping"].ReplaceData(textDataSource)
else:
newTable = Document.Data.Tables.Add("Querysummary Mapping", textDataSource)
tableSettings = DataTableSaveSettings (newTable, False, False)
Document.Data.SaveSettings.DataTableSettings.Add(tableSettings)
I am trying to read and parse a data dictionary for the Census Bureau's American Community Survey Public Use Microsample data release, as found here.
It is reasonably well formated, although with a few lapses where a few explanatory notes are inserted.
I think my preferred outcome is to either get a dataframe with one row per variable, and serialize all value labels for a given variable into one dictionary stored in a value dictionary field in the same row (although a hierarchical json-like format would not be bad, but more complicated.
I got the following code:
import pandas as pd
import re
import urllib2
data = urllib2.urlopen('http://www.census.gov/acs/www/Downloads/data_documentation/pums/DataDict/PUMSDataDict13.txt')
## replace newline characters so we can use dots and find everything until a double
## carriage return (replaced to ||) with a lookahead assertion.
data=data.replace('\n','|')
datadict=pd.DataFrame(re.findall("([A-Z]{2,8})\s{2,9}([0-9]{1})\s{2,6}\|\s{2,4}([A-Za-z\-\(\) ]{3,85})",data,re.MULTILINE),columns=['variable','width','description'])
datadict.head(5)
+----+----------+-------+------------------------------------------------+
| | variable | width | description |
+----+----------+-------+------------------------------------------------+
| 0 | RT | 1 | Record Type |
+----+----------+-------+------------------------------------------------+
| 1 | SERIALNO | 7 | Housing unit |
+----+----------+-------+------------------------------------------------+
| 2 | DIVISION | 1 | Division code |
+----+----------+-------+------------------------------------------------+
| 3 | PUMA | 5 | Public use microdata area code (PUMA) based on |
+----+----------+-------+------------------------------------------------+
| 4 | REGION | 1 | Region code |
+----+----------+-------+------------------------------------------------+
| 5 | ST | 2 | State Code |
+----+----------+-------+------------------------------------------------+
So far so good. The list of variables is there, along with the width in characters of each.
I can expand this and get additional lines (where the value labels live), like so:
datadict_exp=pd.DataFrame(
re.findall("([A-Z]{2,9})\s{2,9}([0-9]{1})\s{2,6}\|\s{4}([A-Za-z\-\(\)\;\<\> 0-9]{2,85})\|\s{11,15}([a-z0-9]{0,2})[ ]\.([A-Za-z/\-\(\) ]{2,120})",
data,re.MULTILINE))
datadict_exp.head(5)
+----+----------+-------+---------------------------------------------------+---------+--------------+
| id | variable | width | description | value_1 | label_1 |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 0 | DIVISION | 1 | Division code | 0 | Puerto Rico |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 1 | REGION | 1 | Region code | 1 | Northeast |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 2 | ST | 2 | State Code | 1 | Alabama/AL |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 3 | NP | 2 | Number of person records following this housin... | 0 | Vacant unit |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 4 | TYPE | 1 | Type of unit | 1 | Housing unit |
+----+----------+-------+---------------------------------------------------+---------+--------------+
So that gets the first value and associated label. My regex issue is here how to repeat the multi-line match starting with \s{11,15} and to the end--i.e. some variables have tons of unique values (ST or state code is followed by some 50 lines, denoting the value and label for each state).
I changed early on the carriage return in the source file with a pipe, thinking that I could then shamelessly rely on the dot to match everything until a double carriage return, indicating the end of that particular variable, and here is where I got stuck.
So--how to repeat a multi-line pattern an arbitrary number of times.
(A complication for later is that some variables are not fully enumerated in the dictionary, but are shown with valid ranges of values. NP for example [number of persons associated with the same household], is denoted with ``02..20` following a description. If I don't account for this, my parsing will miss such entries, of course.)
This isn't a regex, but I parsed PUMSDataDict2013.txt and PUMS_Data_Dictionary_2009-2013.txt (Census ACS 2013 documentation, FTP server) with this Python 3x script below. I used pandas.DataFrame.from_dict and pandas.concat to create a hierarchical dataframe, also below.
Python 3x function to parse PUMSDataDict2013.txt and PUMS_Data_Dictionary_2009-2013.txt:
import collections
import os
def parse_pumsdatadict(path:str) -> collections.OrderedDict:
r"""Parse ACS PUMS Data Dictionaries.
Args:
path (str): Path to downloaded data dictionary.
Returns:
ddict (collections.OrderedDict): Parsed data dictionary with original
key order preserved.
Raises:
FileNotFoundError: Raised if `path` does not exist.
Notes:
* Only some data dictionaries have been tested.[^urls]
* Values are all strings. No data types are inferred from the
original file.
* Example structure of returned `ddict`:
ddict['title'] = '2013 ACS PUMS DATA DICTIONARY'
ddict['date'] = 'August 7, 2015'
ddict['record_types']['HOUSING RECORD']['RT']\
['length'] = '1'
['description'] = 'Record Type'
['var_codes']['H'] = 'Housing Record or Group Quarters Unit'
ddict['record_types']['HOUSING RECORD'][...]
ddict['record_types']['PERSON RECORD'][...]
ddict['notes'] =
['Note for both Industry and Occupation lists...',
'* In cases where the SOC occupation code ends...',
...]
References:
[^urls]: http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/
PUMSDataDict2013.txt
PUMS_Data_Dictionary_2009-2013.txt
"""
# Check arguments.
if not os.path.exists(path):
raise FileNotFoundError(
"Path does not exist:\n{path}".format(path=path))
# Parse data dictionary.
# Note:
# * Data dictionary keys and values are "codes for variables",
# using the ACS terminology,
# https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html
# * The data dictionary is not all encoded in UTF-8. Replace encoding
# errors when found.
# * Catch instances of inconsistently formatted data.
ddict = collections.OrderedDict()
with open(path, encoding='utf-8', errors='replace') as fobj:
# Data dictionary name is line 1.
ddict['title'] = fobj.readline().strip()
# Data dictionary date is line 2.
ddict['date'] = fobj.readline().strip()
# Initialize flags to catch lines.
(catch_var_name, catch_var_desc,
catch_var_code, catch_var_note) = (None, )*4
var_name = None
var_name_last = 'PWGTP80' # Necessary for unformatted end-of-file notes.
for line in fobj:
# Replace tabs with 4 spaces
line = line.replace('\t', ' '*4).rstrip()
# Record type is section header 'HOUSING RECORD' or 'PERSON RECORD'.
if (line.strip() == 'HOUSING RECORD'
or line.strip() == 'PERSON RECORD'):
record_type = line.strip()
if 'record_types' not in ddict:
ddict['record_types'] = collections.OrderedDict()
ddict['record_types'][record_type] = collections.OrderedDict()
# A newline precedes a variable name.
# A newline follows the last variable code.
elif line == '':
# Example inconsistent format case:
# WGTP54 5
# Housing Weight replicate 54
#
# -9999..09999 .Integer weight of housing unit
if (catch_var_code
and 'var_codes' not in ddict['record_types'][record_type][var_name]):
pass
# Terminate the previous variable block and look for the next
# variable name, unless past last variable name.
else:
catch_var_code = False
catch_var_note = False
if var_name != var_name_last:
catch_var_name = True
# Variable name is 1 line with 0 space indent.
# Variable name is followed by variable description.
# Variable note is optional.
# Variable note is preceded by newline.
# Variable note is 1+ lines.
# Variable note is followed by newline.
elif (catch_var_name and not line.startswith(' ')
and var_name != var_name_last):
# Example: "Note: Public use microdata areas (PUMAs) ..."
if line.lower().startswith('note:'):
var_note = line.strip() # type(var_note) == str
if 'notes' not in ddict['record_types'][record_type][var_name]:
ddict['record_types'][record_type][var_name]['notes'] = list()
# Append a new note.
ddict['record_types'][record_type][var_name]['notes'].append(var_note)
catch_var_note = True
# Example: """
# Note: Public Use Microdata Areas (PUMAs) designate areas ...
# population. Use with ST for unique code. PUMA00 applies ...
# ...
# """
elif catch_var_note:
var_note = line.strip() # type(var_note) == str
if 'notes' not in ddict['record_types'][record_type][var_name]:
ddict['record_types'][record_type][var_name]['notes'] = list()
# Concatenate to most recent note.
ddict['record_types'][record_type][var_name]['notes'][-1] += ' '+var_note
# Example: "NWAB 1 (UNEDITED - See 'Employment Status Recode' (ESR))"
else:
# type(var_note) == list
(var_name, var_len, *var_note) = line.strip().split(maxsplit=2)
ddict['record_types'][record_type][var_name] = collections.OrderedDict()
ddict['record_types'][record_type][var_name]['length'] = var_len
# Append a new note if exists.
if len(var_note) > 0:
if 'notes' not in ddict['record_types'][record_type][var_name]:
ddict['record_types'][record_type][var_name]['notes'] = list()
ddict['record_types'][record_type][var_name]['notes'].append(var_note[0])
catch_var_name = False
catch_var_desc = True
var_desc_indent = None
# Variable description is 1+ lines with 1+ space indent.
# Variable description is followed by variable code(s).
# Variable code(s) is 1+ line with larger whitespace indent
# than variable description. Example:"""
# PUMA00 5
# Public use microdata area code (PUMA) based on Census 2000 definition for data
# collected prior to 2012. Use in combination with PUMA10.
# 00100..08200 .Public use microdata area codes
# 77777 .Combination of 01801, 01802, and 01905 in Louisiana
# -0009 .Code classification is Not Applicable because data
# .collected in 2012 or later
# """
# The last variable code is followed by a newline.
elif (catch_var_desc or catch_var_code) and line.startswith(' '):
indent = len(line) - len(line.lstrip())
# For line 1 of variable description.
if catch_var_desc and var_desc_indent is None:
var_desc_indent = indent
var_desc = line.strip()
ddict['record_types'][record_type][var_name]['description'] = var_desc
# For lines 2+ of variable description.
elif catch_var_desc and indent <= var_desc_indent:
var_desc = line.strip()
ddict['record_types'][record_type][var_name]['description'] += ' '+var_desc
# For lines 1+ of variable codes.
else:
catch_var_desc = False
catch_var_code = True
is_valid_code = None
if not line.strip().startswith('.'):
# Example case: "01 .One person record (one person in household or"
if ' .' in line:
(var_code, var_code_desc) = line.strip().split(
sep=' .', maxsplit=1)
is_valid_code = True
# Example inconsistent format case:"""
# bbbb. N/A (age less than 15 years; never married)
# """
elif '. ' in line:
(var_code, var_code_desc) = line.strip().split(
sep='. ', maxsplit=1)
is_valid_code = True
else:
raise AssertionError(
"Program error. Line unaccounted for:\n" +
"{line}".format(line=line))
if is_valid_code:
if 'var_codes' not in ddict['record_types'][record_type][var_name]:
ddict['record_types'][record_type][var_name]['var_codes'] = collections.OrderedDict()
ddict['record_types'][record_type][var_name]['var_codes'][var_code] = var_code_desc
# Example case: ".any person in group quarters)"
else:
var_code_desc = line.strip().lstrip('.')
ddict['record_types'][record_type][var_name]['var_codes'][var_code] += ' '+var_code_desc
# Example inconsistent format case:"""
# ADJHSG 7
# Adjustment factor for housing dollar amounts (6 implied decimal places)
# """
elif (catch_var_desc and
'description' not in ddict['record_types'][record_type][var_name]):
var_desc = line.strip()
ddict['record_types'][record_type][var_name]['description'] = var_desc
catch_var_desc = False
catch_var_code = True
# Example inconsistent format case:"""
# WGTP10 5
# Housing Weight replicate 10
# -9999..09999 .Integer weight of housing unit
# WGTP11 5
# Housing Weight replicate 11
# -9999..09999 .Integer weight of housing unit
# """
elif ((var_name == 'WGTP10' and 'WGTP11' in line)
or (var_name == 'YOEP12' and 'ANC' in line)):
# type(var_note) == list
(var_name, var_len, *var_note) = line.strip().split(maxsplit=2)
ddict['record_types'][record_type][var_name] = collections.OrderedDict()
ddict['record_types'][record_type][var_name]['length'] = var_len
if len(var_note) > 0:
if 'notes' not in ddict['record_types'][record_type][var_name]:
ddict['record_types'][record_type][var_name]['notes'] = list()
ddict['record_types'][record_type][var_name]['notes'].append(var_note[0])
catch_var_name = False
catch_var_desc = True
var_desc_indent = None
else:
if (catch_var_name, catch_var_desc,
catch_var_code, catch_var_note) != (False, )*4:
raise AssertionError(
"Program error. All flags to catch lines should be set " +
"to `False` by end-of-file.")
if var_name != var_name_last:
raise AssertionError(
"Program error. End-of-file notes should only be read "+
"after `var_name_last` has been processed.")
if 'notes' not in ddict:
ddict['notes'] = list()
ddict['notes'].append(line)
return ddict
Create the hierarchical dataframe (formatted below as Jupyter Notebook cells):
In [ ]:
import pandas as pd
ddict = parse_pumsdatadict(path=r'/path/to/PUMSDataDict2013.txt')
tmp = dict()
for record_type in ddict['record_types']:
tmp[record_type] = pd.DataFrame.from_dict(ddict['record_types'][record_type], orient='index')
df_ddict = pd.concat(tmp, names=['record_type', 'var_name'])
df_ddict.head()
Out[ ]:
# Click "Run code snippet" below to render the output from `df_ddict.head()`.
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th></th>
<th>length</th>
<th>description</th>
<th>var_codes</th>
<th>notes</th>
</tr>
<tr>
<th>record_type</th>
<th>var_name</th>
<th></th>
<th></th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<th rowspan="5" valign="top">HOUSING RECORD</th>
<th>ACCESS</th>
<td>1</td>
<td>Access to the Internet</td>
<td>{'b': 'N/A (GQ)', '1': 'Yes, with subscription...</td>
<td>NaN</td>
</tr>
<tr>
<th>ACR</th>
<td>1</td>
<td>Lot size</td>
<td>{'b': 'N/A (GQ/not a one-family house or mobil...</td>
<td>NaN</td>
</tr>
<tr>
<th>ADJHSG</th>
<td>7</td>
<td>Adjustment factor for housing dollar amounts (...</td>
<td>{'1000000': '2013 factor (1.000000)'}</td>
<td>[Note: The value of ADJHSG inflation-adjusts r...</td>
</tr>
<tr>
<th>ADJINC</th>
<td>7</td>
<td>Adjustment factor for income and earnings doll...</td>
<td>{'1007549': '2013 factor (1.007549)'}</td>
<td>[Note: The value of ADJINC inflation-adjusts r...</td>
</tr>
<tr>
<th>AGS</th>
<td>1</td>
<td>Sales of Agriculture Products (Yearly sales)</td>
<td>{'b': 'N/A (GQ/vacant/not a one family house o...</td>
<td>[Note: no adjustment factor is applied to AGS.]</td>
</tr>
</tbody>
</table>
I have tables which looks like this:
text = """
ID = 1234
Hello World 135,343 117,668 81,228
Another line of text (30,632) (48,063)
More text 0 11,205 0
Even more text 1,447 681
ID = 18372
Another table 35,323 38,302 909,381
Another line with text 13 15
More text here 7 0
Even more text here 7,011 1,447 681
"""
Is there a way to replace the "blank" entries in each table with 0? I am trying to set delimiters between the entries, but using the following code can't deal with blank spots in the tables:
for line in text.splitlines():
if 'ID' not in line:
line1 = line.split()
line = '|'.join((' '.join(line1[:-3]), '|'.join(line1[-3:])))
print line
else:
print line
The output is:
ID = 1234
|
Hello World|135,343|117,668|81,228
Another line of|text|(30,632)|(48,063)
More text|0|11,205|0
Even more|text|1,447|681
|
ID = 18372
|
Another table|35,323|38,302|909,381
Another line with|text|13|15
More text|here|7|0
Even more text here|7,011|1,447|681
As you can see, the first problem shows up on the second line of the first table. The word 'text' is considered the first column. Any way to fix this in Python to replace blank entries with 0?
Here is a function for finding columns in a bunch of lines. The second argument pat defines what a column is, and can be any regex.
import itertools as it
import re
def find_columns(lines, pat = r' '):
'''
Usage:
widths = find_columns(lines)
for line in lines:
if not line: continue
vals = [ line[widths[i]:widths[i+1]].strip() for i in range(len(widths)-1) ]
'''
widths = []
maxlen = max(len(line) for line in lines)
for line in lines:
line = ''.join([line, ' '*(maxlen-len(line))])
candidates = []
for match in re.finditer(pat, line):
candidates.extend(range(match.start(), match.end()+1))
widths.append(set(candidates))
widths = sorted(set.intersection(*widths))
diffs = [widths[i+1]-widths[i] for i in range(len(widths)-1)]
diffs = [None]+diffs
widths = [w for d, w in zip(diffs, widths) if d != 1]
if widths[0] != 0: widths = [0]+widths
return widths
def report(text):
for key, group in it.groupby(text.splitlines(), lambda line:line.startswith('ID')):
lines = list(group)
if key:
print('\n'.join(lines))
else:
# r' (?![a-zA-Z])' defines a column to be any whitespace
# not followed by alphabetic characters.
widths = find_columns(lines, pat = r'\s(?![a-zA-Z])')
for line in lines:
if not line: continue
vals = [ line[widths[i]:widths[i+1]] for i in range(len(widths)-1) ]
vals = [v if v.strip() else v[1:]+'0' for v in vals]
print('|'.join(vals))
text = """\
ID = 1234
Hello World 135,343 117,668 81,228
Another line of text (30,632) (48,063)
More text 0 11,205 0
Even more text 1,447 681
ID = 18372
Another table 35,323 38,302 909,381
Another line with text 13 15
More text here 7 0
Even more text here 7,011 1,447 681
"""
report(text)
yields
ID = 1234
Hello World | 135,343| 117,668| 81,228
Another line of text| (30,632)| 0| (48,063)
More text | 0 | 11,205| 0
Even more text | 0| 1,447 | 681
ID = 18372
Another table | 35,323| 38,302| 909,381
Another line with text| 13 | 15|0
More text here | 0| 7 | 0
Even more text here | 7,011| 1,447| 681
DB table
select * from AAA;
id | name | grade
--------------------
1 | john | A
2 | cavin | B
django
grade_list = AAA.objects.all()
for item in grade_list:
print item.name
result ->
john
cavin
============================================
I want to change this code(same function)
grade_list = AAA.objects.all()
print_field = 'name'
for item in grade_list:
print item.( print_field... )
result ->
john
cavin
Q) fill in the blank please. Is it possible?
Yes you can do this with the getattr built-in function.
grade_list = AAA.objects.all()
print_field = 'name'
for item in grade_list:
print getattr(item, print_field)