import whois
import time
from datetime import datetime
import requests
import re

date = False
points = 0

class bcolors:  # just adding colors, ignore this
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

def date_c():
    if date:
        print(f'{bcolors.OKGREEN}{td}Y{bcolors.ENDC}')
    else:
        print(f'{bcolors.FAIL}Less than a year{bcolors.ENDC}')

user_input = input('Site Link: ')
domain = whois.whois(user_input)
name = domain.domain_name[0]
d1 = domain.creation_date[0].strftime('%Y')
d2 = datetime.today().strftime('%Y')
td = int(d2) - int(d1)

if td >= 1:
    points += 2
    date = True

print(f'''
{'-' * (len(name) + 8)}
DOMAIN: {name}
| Age: {date_c()}  # This is None, and the value is displayed outside from here
| Secure:
| Status:
| Emails:
| Address:
| City:
| Recently Updated:
{'-' * (len(name) + 8)}
''')
I am getting the age of google; if the age (in years) is >= 1, then date = True, and if date is True, the print statement should display it. But the value shows up outside the table and date_c is somehow None. Any help is appreciated!
the value is outside and date_c is somehow None
This function returns None because there's no explicit return statement:
def date_c():
    if date:
        print(f'{bcolors.OKGREEN}{td}Y{bcolors.ENDC}')
    else:
        print(f'{bcolors.FAIL}Less than a year{bcolors.ENDC}')
"The value is outside" because Python will call date_c in order to construct the resulting f-string, but date_c will print something to the terminal via its print calls and return None, which will be incorporated into the f-string. Only then the resulting string will be printed, and now there are two outputs.
You should probably return the strings from date_c:
def date_c():
    if date:
        return f'{bcolors.OKGREEN}{td}Y{bcolors.ENDC}'
    else:
        return f'{bcolors.FAIL}Less than a year{bcolors.ENDC}'
Related
I am trying to make a table (or CSV; I am using a pandas DataFrame) from the information in an XML file.
The file is here (the .zip is 14 MB, the XML is ~370 MB): https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.zip. It has package information for different languages (node.js, python, java, etc.), i.e. the CPE 2.3 list published by NVD, a US government organization.
This is what the first 30 lines look like:
<cpe-list xmlns:config="http://scap.nist.gov/schema/configuration/0.1" xmlns="http://cpe.mitre.org/dictionary/2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:scap-core="http://scap.nist.gov/schema/scap-core/0.3" xmlns:cpe-23="http://scap.nist.gov/schema/cpe-extension/2.3" xmlns:ns6="http://scap.nist.gov/schema/scap-core/0.1" xmlns:meta="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" xsi:schemaLocation="http://scap.nist.gov/schema/cpe-extension/2.3 https://scap.nist.gov/schema/cpe/2.3/cpe-dictionary-extension_2.3.xsd http://cpe.mitre.org/dictionary/2.0 https://scap.nist.gov/schema/cpe/2.3/cpe-dictionary_2.3.xsd http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2 https://scap.nist.gov/schema/cpe/2.1/cpe-dictionary-metadata_0.2.xsd http://scap.nist.gov/schema/scap-core/0.3 https://scap.nist.gov/schema/nvd/scap-core_0.3.xsd http://scap.nist.gov/schema/configuration/0.1 https://scap.nist.gov/schema/nvd/configuration_0.1.xsd http://scap.nist.gov/schema/scap-core/0.1 https://scap.nist.gov/schema/nvd/scap-core_0.1.xsd">
<generator>
<product_name>National Vulnerability Database (NVD)</product_name>
<product_version>4.9</product_version>
<schema_version>2.3</schema_version>
<timestamp>2022-03-17T03:51:01.909Z</timestamp>
</generator>
<cpe-item name="cpe:/a:%240.99_kindle_books_project:%240.99_kindle_books:6::~~~android~~">
<title xml:lang="en-US">$0.99 Kindle Books project $0.99 Kindle Books (aka com.kindle.books.for99) for android 6.0</title>
<references>
<reference href="https://play.google.com/store/apps/details?id=com.kindle.books.for99">Product information</reference>
<reference href="https://docs.google.com/spreadsheets/d/1t5GXwjw82SyunALVJb2w0zi3FoLRIkfGPc7AMjRF0r4/edit?pli=1#gid=1053404143">Government Advisory</reference>
</references>
<cpe-23:cpe23-item name="cpe:2.3:a:\$0.99_kindle_books_project:\$0.99_kindle_books:6:*:*:*:*:android:*:*"/>
</cpe-item>
The tree structure of the XML file is quite simple, the root is 'cpe-list', the child element is 'cpe-item', and the grandchild elements are 'title', 'references' and 'cpe23-item'.
From 'title', I want the text in the element;
From 'cpe23-item', I want the attribute 'name';
From 'references', I want the 'href' attribute of each of its 'reference' children.
The dataframe should look like this:
| cpe23_name | title_text | ref1 | ref2 | ref3 | ref_other
0 | 'cpe23name 1'| 'this is a python pkg'| 'url1'| 'url2'| NaN | NaN
1 | 'cpe23name 2'| 'this is a java pkg' | 'url1'| 'url2'| NaN | NaN
...
My code is below; it finishes in ~100 sec:
import xml.etree.ElementTree as et
import pandas as pd

xtree = et.parse("official-cpe-dictionary_v2.3.xml")
xroot = xtree.getroot()

import time
start_time = time.time()

df_cols = ["cpe", "text", "vendor", "product", "version", "changelog", "advisory", "others"]
title = '{http://cpe.mitre.org/dictionary/2.0}title'
ref = '{http://cpe.mitre.org/dictionary/2.0}references'
cpe_item = '{http://scap.nist.gov/schema/cpe-extension/2.3}cpe23-item'

p_cpe = None
p_text = None
p_vend = None
p_prod = None
p_vers = None
p_chan = None
p_advi = None
p_othe = None

rows = []
i = 0
while i < len(xroot):
    for elm in xroot[i]:
        if elm.tag == title:
            p_text = elm.text
            # assign p_text
        elif elm.tag == ref:
            for nn in elm:
                s = nn.text.lower()
                # check the lower-cased text in the references
                if 'version' in s:
                    p_vers = nn.attrib.get('href')
                    # assign p_vers
                elif 'advisor' in s:
                    p_advi = nn.attrib.get('href')
                    # assign p_advi
                elif 'product' in s:
                    p_prod = nn.attrib.get('href')
                    # assign p_prod
                elif 'vendor' in s:
                    p_vend = nn.attrib.get('href')
                    # assign p_vend
                elif 'change' in s:
                    p_chan = nn.attrib.get('href')
                    # assign p_chan
                else:
                    p_othe = nn.attrib.get('href')
        elif elm.tag == cpe_item:
            p_cpe = elm.attrib.get("name")
            # assign p_cpe
        else:
            print(elm.tag)
    row = [p_cpe, p_text, p_vend, p_prod, p_vers, p_chan, p_advi, p_othe]
    rows.append(row)
    p_cpe = None
    p_text = None
    p_vend = None
    p_prod = None
    p_vers = None
    p_chan = None
    p_advi = None
    p_othe = None
    print(len(rows))  # this shows how far I got during the running time
    i += 1
    out_df1 = pd.DataFrame(rows, columns=df_cols)  # move this part outside the loop by removing the indent

print("---853k rows take %s seconds ---" % (time.time() - start_time))
Update: the faster way is to move the second-to-last line (the DataFrame construction) outside the loop. Since rows already collects the info on each iteration, there is no need to build a new DataFrame every time.
The running time is now 136.0491042137146 seconds. Yay!
Since your XML is fairly flat, consider the recently added I/O method pandas.read_xml, introduced in v1.3. Because the XML uses a default namespace, reference the elements in xpath via the namespaces argument:
url = "https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.zip"
df = pd.read_xml(
    url, xpath=".//doc:cpe-item", namespaces={'doc': 'http://cpe.mitre.org/dictionary/2.0'}
)
If you do not have the default parser, lxml, installed, use the etree parser:
df = pd.read_xml(
    url, xpath=".//doc:cpe-item", namespaces={'doc': 'http://cpe.mitre.org/dictionary/2.0'}, parser="etree"
)
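As a rough comparison with the timing above, the read_xml call can be clocked the same way (a sketch; the resulting column names depend on how read_xml flattens the child elements):

import time
import pandas as pd

url = "https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.zip"

start_time = time.time()
df = pd.read_xml(
    url,
    xpath=".//doc:cpe-item",
    namespaces={'doc': 'http://cpe.mitre.org/dictionary/2.0'},
)
print(df.shape)
print("--- %s seconds ---" % (time.time() - start_time))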
I want to create comments from a dataset that details the growth rate, market share, etc. for various markets and products. The dataset is in the form of a pd.DataFrame(). I would like the comment to include keywords like increase/decrease based on the calculations; for example, if January 2020 has sales of 1000 and January 2021 has sales of 1600, then that necessarily means an increase of 60%.
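For reference, that growth figure is just the relative change:

growth = (1600 - 1000) / 1000   # 0.6, i.e. an increase of 60%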
I defined a function outside, as shown below, and I would like to know whether this method is too clumsy and, if so, how I should improve it.
from collections import namedtuple

GrowthIncDec = namedtuple('gr_tuple', ['annual_growth_rate', 'quarterly_growth_rate'])

def increase_decrease(annual_gr, quarter_gr):
    if annual_gr > 0:
        annual_growth_rate = 'increased'
    elif annual_gr < 0:
        annual_growth_rate = 'decreased'
    else:
        annual_growth_rate = 'stayed the same'
    if quarter_gr > 0:
        quarterly_growth_rate = 'increased'
    elif quarter_gr < 0:
        quarterly_growth_rate = 'decreased'
    else:
        quarterly_growth_rate = 'stayed the same'
    gr_named_tuple = GrowthIncDec(annual_growth_rate=annual_growth_rate, quarterly_growth_rate=quarterly_growth_rate)
    return gr_named_tuple
myfunc = increase_decrease(5, -1)
myfunc.annual_growth_rate
output: 'increased'
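For comparison, the duplicated if/elif blocks could be collapsed into a single helper (just a sketch reusing the names above; _trend is a made-up name):

def _trend(value):
    # map the sign of a growth rate onto a word
    if value > 0:
        return 'increased'
    if value < 0:
        return 'decreased'
    return 'stayed the same'

def increase_decrease(annual_gr, quarter_gr):
    return GrowthIncDec(annual_growth_rate=_trend(annual_gr),
                        quarterly_growth_rate=_trend(quarter_gr))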
A snippet of my main code is as follows to illustrate the use of the above function:
def get_comments(grp, some_dict: Dict[str, List[str]]):
    .......
    try:
        subdf = the dataframe
        annual_gr = subdf['Annual_Growth'].values[0]
        quarter_gr = subdf['Quarterly_Growth'].values[0]
        inc_dec_named_tup = increase_decrease(annual_gr, quarter_gr)
        inc_dec_annual_gr = inc_dec_named_tup.annual_growth_rate
        inc_dec_quarterly_gr = inc_dec_named_tup.quarterly_growth_rate
        comment = "The {} has {} by {:.1%} in {} {} compared to {} {}"\
            .format(market, inc_dec_annual_gr, annual_gr, timeperiod, curr_date, timeperiod, prev_year)
        comments_df = pd.DataFrame(columns=['Date', 'Comments'])
        # comments_df['Date'] = [curr_date]
        comments_df['Comments'] = [comment]
        return comments_df
    except (IndexError, KeyError) as e:
        # this is for all those nan values which is empty
        annual_gr = 0
        quarter_gr = 0
I've been working on a Python script and am having issues with some verifications I set up. I have a procedure file with a function that uses an order number and a customer number to check some past history about the customer's orders. I've been testing live on our server and I keep failing the last if statement. The customer number I'm using does have more than one order, and some are over 60 days old, so it should pass the test, but it doesn't. I've been looking over my code and I just can't see what could be causing this.
edit: here are the print results of current and retrieved timestamps:
current_timestamp = 1531849617.921927
retrieved_timestamp = 1489622400
two_month_seconds = 5184000
one_month_seconds = 2592000
Python3
from classes import helper
from classes import api
from classes import order
from procedures import orderReleaseProcedure
import time
import datetime
import re

def verifyCustomer(customer_id, order_id):
    self_helper = helper.Helper()
    customer_blocked_reasons = self_helper.getConfig('customer_blocked_reasons')
    order_statuses = self_helper.getConfig('order_statuses')
    customer_is_blocked = False
    self_api = api.Api()
    self_order = order.Order(order_id)
    status = {
        'success': 0,
        'message': 'verify_payment_method'
    }
    results = self_api.which_api('orders?customer_id={}'.format(customer_id))
    order_count = results['total_count']
    if order_count > 1:
        for result in results['orders']:
            order_status_info = self_api.which_api('order_statuses/%d' % result['order_status_id'])
            for customer_blocked_reason in customer_blocked_reasons:
                if customer_blocked_reason in order_status_info['name']:
                    customer_is_blocked = True
                    order_id = 0
            order_date = result['ordered_at']
            two_month_seconds = (3600 * 24) * 60
            one_month_seconds = (3600 * 24) * 30
            stripped_date = order_date[:order_date.find("T")]
            current_timestamp = time.time()
            retrieved_timestamp = int(datetime.datetime.strptime(stripped_date, '%Y-%m-%d').strftime("%s"))
            if retrieved_timestamp > (current_timestamp - one_month_seconds) and not customer_is_blocked:
                status['success'] = 1
                status['message'] = "Customer Verified with orders older than 30 days and no blocking reasons"
                print(' 30 day check was triggered ')
                print(status)
                break
            elif customer_is_blocked:
                status_change_result = self_order.update_status(order_statuses['order_hold_manager_review'])
                status['success'] = 1
                status['message'] = "Changed order status to Order Hold - Manager Review"
                print(' Customer block was triggered ')
                print(status_change_result)
                break
            elif not retrieved_timestamp < (current_timestamp - two_month_seconds):
                status['success'] = 0
                status['message'] = "There is more than 1 order, and none are greater than 60 days, we need to check manually"
                print(' 60 day check was triggered ')
                print(status)
                break
    return status
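Plugging the numbers from the edit above into the two comparisons shows what each branch evaluates to (plain arithmetic, using only the values already printed):

current_timestamp = 1531849617.921927
retrieved_timestamp = 1489622400
two_month_seconds = 5184000
one_month_seconds = 2592000

# 30-day branch: retrieved_timestamp > current_timestamp - one_month_seconds
print(retrieved_timestamp > current_timestamp - one_month_seconds)      # False (1489622400 vs 1529257617.9)

# 60-day branch: not retrieved_timestamp < current_timestamp - two_month_seconds
print(not retrieved_timestamp < current_timestamp - two_month_seconds)  # False (1489622400 < 1526665617.9)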
I am trying to read and parse a data dictionary for the Census Bureau's American Community Survey Public Use Microdata Sample (PUMS) data release, as found here.
It is reasonably well formatted, although with a few lapses where explanatory notes are inserted.
My preferred outcome is to get a dataframe with one row per variable, serializing all value labels for a given variable into one dictionary stored in a value-dictionary field of the same row (although a hierarchical, JSON-like format would not be bad, just more complicated).
I have the following code:
import pandas as pd
import re
import urllib2

data = urllib2.urlopen('http://www.census.gov/acs/www/Downloads/data_documentation/pums/DataDict/PUMSDataDict13.txt')
data = data.read()  # read the response body into a string
## replace newline characters so we can use dots and find everything until a double
## carriage return (replaced to ||) with a lookahead assertion.
data = data.replace('\n', '|')
datadict = pd.DataFrame(
    re.findall("([A-Z]{2,8})\s{2,9}([0-9]{1})\s{2,6}\|\s{2,4}([A-Za-z\-\(\) ]{3,85})", data, re.MULTILINE),
    columns=['variable', 'width', 'description'])
datadict.head(5)
+----+----------+-------+------------------------------------------------+
| | variable | width | description |
+----+----------+-------+------------------------------------------------+
| 0 | RT | 1 | Record Type |
+----+----------+-------+------------------------------------------------+
| 1 | SERIALNO | 7 | Housing unit |
+----+----------+-------+------------------------------------------------+
| 2 | DIVISION | 1 | Division code |
+----+----------+-------+------------------------------------------------+
| 3 | PUMA | 5 | Public use microdata area code (PUMA) based on |
+----+----------+-------+------------------------------------------------+
| 4 | REGION | 1 | Region code |
+----+----------+-------+------------------------------------------------+
| 5 | ST | 2 | State Code |
+----+----------+-------+------------------------------------------------+
So far so good. The list of variables is there, along with the width in characters of each.
I can expand this and get additional lines (where the value labels live), like so:
datadict_exp=pd.DataFrame(
re.findall("([A-Z]{2,9})\s{2,9}([0-9]{1})\s{2,6}\|\s{4}([A-Za-z\-\(\)\;\<\> 0-9]{2,85})\|\s{11,15}([a-z0-9]{0,2})[ ]\.([A-Za-z/\-\(\) ]{2,120})",
data,re.MULTILINE))
datadict_exp.head(5)
+----+----------+-------+---------------------------------------------------+---------+--------------+
| id | variable | width | description | value_1 | label_1 |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 0 | DIVISION | 1 | Division code | 0 | Puerto Rico |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 1 | REGION | 1 | Region code | 1 | Northeast |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 2 | ST | 2 | State Code | 1 | Alabama/AL |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 3 | NP | 2 | Number of person records following this housin... | 0 | Vacant unit |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 4 | TYPE | 1 | Type of unit | 1 | Housing unit |
+----+----------+-------+---------------------------------------------------+---------+--------------+
So that gets the first value and associated label. My regex issue is how to repeat the multi-line match starting at \s{11,15} through to the end of the block, because some variables have many unique values (ST, the state code, is followed by some 50 lines giving the value and label for each state).
Early on I replaced the newlines in the source file with a pipe, thinking I could then rely on the dot to match everything up to a double newline (now ||), which marks the end of that particular variable, and this is where I got stuck.
So: how do I repeat a multi-line pattern an arbitrary number of times?
(A complication for later is that some variables are not fully enumerated in the dictionary, but are shown with valid ranges of values. NP, for example [number of persons associated with the same household], is denoted with `02..20` following the description. If I don't account for this, my parsing will miss such entries, of course.)
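For what it's worth, one common workaround for the 'repeat a group an arbitrary number of times' limitation is a two-pass approach: first capture each variable's whole block (everything up to the || that marks the blank line), then run a second findall over that block for the value/label pairs. A rough sketch against the pipe-joined data string from above (untested against the real file, so the patterns are only indicative):

import re

# one tuple per variable: name, width, and everything up to the double pipe
block_re = re.compile(r'([A-Z]{2,9})\s{2,9}([0-9])\s{2,6}\|(.*?)\|\|')
# value/label pairs inside a block, mirroring the value pattern used above
value_re = re.compile(r'\|\s{11,15}([a-z0-9]{1,2})[ ]\.([A-Za-z/\-\(\) ]{2,120})')

records = []
for variable, width, block in block_re.findall(data):
    labels = dict(value_re.findall(block))
    records.append({'variable': variable, 'width': width, 'values': labels})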
This isn't a regex, but I parsed PUMSDataDict2013.txt and PUMS_Data_Dictionary_2009-2013.txt (Census ACS 2013 documentation, FTP server) with the Python 3.x script below. I used pandas.DataFrame.from_dict and pandas.concat to create a hierarchical dataframe, also below.
Python 3.x function to parse PUMSDataDict2013.txt and PUMS_Data_Dictionary_2009-2013.txt:
import collections
import os


def parse_pumsdatadict(path: str) -> collections.OrderedDict:
    r"""Parse ACS PUMS Data Dictionaries.

    Args:
        path (str): Path to downloaded data dictionary.

    Returns:
        ddict (collections.OrderedDict): Parsed data dictionary with original
            key order preserved.

    Raises:
        FileNotFoundError: Raised if `path` does not exist.

    Notes:
        * Only some data dictionaries have been tested.[^urls]
        * Values are all strings. No data types are inferred from the
          original file.
        * Example structure of returned `ddict`:
            ddict['title'] = '2013 ACS PUMS DATA DICTIONARY'
            ddict['date'] = 'August 7, 2015'
            ddict['record_types']['HOUSING RECORD']['RT']\
                ['length'] = '1'
                ['description'] = 'Record Type'
                ['var_codes']['H'] = 'Housing Record or Group Quarters Unit'
            ddict['record_types']['HOUSING RECORD'][...]
            ddict['record_types']['PERSON RECORD'][...]
            ddict['notes'] =
                ['Note for both Industry and Occupation lists...',
                 '* In cases where the SOC occupation code ends...',
                 ...]

    References:
        [^urls]: http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/
            PUMSDataDict2013.txt
            PUMS_Data_Dictionary_2009-2013.txt
    """
    # Check arguments.
    if not os.path.exists(path):
        raise FileNotFoundError(
            "Path does not exist:\n{path}".format(path=path))
    # Parse data dictionary.
    # Note:
    # * Data dictionary keys and values are "codes for variables",
    #   using the ACS terminology,
    #   https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html
    # * The data dictionary is not all encoded in UTF-8. Replace encoding
    #   errors when found.
    # * Catch instances of inconsistently formatted data.
    ddict = collections.OrderedDict()
    with open(path, encoding='utf-8', errors='replace') as fobj:
        # Data dictionary name is line 1.
        ddict['title'] = fobj.readline().strip()
        # Data dictionary date is line 2.
        ddict['date'] = fobj.readline().strip()
        # Initialize flags to catch lines.
        (catch_var_name, catch_var_desc,
         catch_var_code, catch_var_note) = (None, )*4
        var_name = None
        var_name_last = 'PWGTP80'  # Necessary for unformatted end-of-file notes.
        for line in fobj:
            # Replace tabs with 4 spaces
            line = line.replace('\t', ' '*4).rstrip()
            # Record type is section header 'HOUSING RECORD' or 'PERSON RECORD'.
            if (line.strip() == 'HOUSING RECORD'
                    or line.strip() == 'PERSON RECORD'):
                record_type = line.strip()
                if 'record_types' not in ddict:
                    ddict['record_types'] = collections.OrderedDict()
                ddict['record_types'][record_type] = collections.OrderedDict()
            # A newline precedes a variable name.
            # A newline follows the last variable code.
            elif line == '':
                # Example inconsistent format case:
                # WGTP54     5
                #     Housing Weight replicate 54
                #
                #           -9999..09999 .Integer weight of housing unit
                if (catch_var_code
                        and 'var_codes' not in ddict['record_types'][record_type][var_name]):
                    pass
                # Terminate the previous variable block and look for the next
                # variable name, unless past last variable name.
                else:
                    catch_var_code = False
                    catch_var_note = False
                    if var_name != var_name_last:
                        catch_var_name = True
            # Variable name is 1 line with 0 space indent.
            # Variable name is followed by variable description.
            # Variable note is optional.
            # Variable note is preceded by newline.
            # Variable note is 1+ lines.
            # Variable note is followed by newline.
            elif (catch_var_name and not line.startswith(' ')
                    and var_name != var_name_last):
                # Example: "Note: Public use microdata areas (PUMAs) ..."
                if line.lower().startswith('note:'):
                    var_note = line.strip()  # type(var_note) == str
                    if 'notes' not in ddict['record_types'][record_type][var_name]:
                        ddict['record_types'][record_type][var_name]['notes'] = list()
                    # Append a new note.
                    ddict['record_types'][record_type][var_name]['notes'].append(var_note)
                    catch_var_note = True
                # Example: """
                # Note: Public Use Microdata Areas (PUMAs) designate areas ...
                # population. Use with ST for unique code. PUMA00 applies ...
                # ...
                # """
                elif catch_var_note:
                    var_note = line.strip()  # type(var_note) == str
                    if 'notes' not in ddict['record_types'][record_type][var_name]:
                        ddict['record_types'][record_type][var_name]['notes'] = list()
                    # Concatenate to most recent note.
                    ddict['record_types'][record_type][var_name]['notes'][-1] += ' '+var_note
                # Example: "NWAB 1 (UNEDITED - See 'Employment Status Recode' (ESR))"
                else:
                    # type(var_note) == list
                    (var_name, var_len, *var_note) = line.strip().split(maxsplit=2)
                    ddict['record_types'][record_type][var_name] = collections.OrderedDict()
                    ddict['record_types'][record_type][var_name]['length'] = var_len
                    # Append a new note if exists.
                    if len(var_note) > 0:
                        if 'notes' not in ddict['record_types'][record_type][var_name]:
                            ddict['record_types'][record_type][var_name]['notes'] = list()
                        ddict['record_types'][record_type][var_name]['notes'].append(var_note[0])
                    catch_var_name = False
                    catch_var_desc = True
                    var_desc_indent = None
            # Variable description is 1+ lines with 1+ space indent.
            # Variable description is followed by variable code(s).
            # Variable code(s) is 1+ line with larger whitespace indent
            # than variable description. Example:"""
            # PUMA00 5
            #     Public use microdata area code (PUMA) based on Census 2000 definition for data
            #     collected prior to 2012. Use in combination with PUMA10.
            #           00100..08200 .Public use microdata area codes
            #           77777 .Combination of 01801, 01802, and 01905 in Louisiana
            #           -0009 .Code classification is Not Applicable because data
            #                 .collected in 2012 or later
            # """
            # The last variable code is followed by a newline.
            elif (catch_var_desc or catch_var_code) and line.startswith(' '):
                indent = len(line) - len(line.lstrip())
                # For line 1 of variable description.
                if catch_var_desc and var_desc_indent is None:
                    var_desc_indent = indent
                    var_desc = line.strip()
                    ddict['record_types'][record_type][var_name]['description'] = var_desc
                # For lines 2+ of variable description.
                elif catch_var_desc and indent <= var_desc_indent:
                    var_desc = line.strip()
                    ddict['record_types'][record_type][var_name]['description'] += ' '+var_desc
                # For lines 1+ of variable codes.
                else:
                    catch_var_desc = False
                    catch_var_code = True
                    is_valid_code = None
                    if not line.strip().startswith('.'):
                        # Example case: "01 .One person record (one person in household or"
                        if ' .' in line:
                            (var_code, var_code_desc) = line.strip().split(
                                sep=' .', maxsplit=1)
                            is_valid_code = True
                        # Example inconsistent format case:"""
                        # bbbb. N/A (age less than 15 years; never married)
                        # """
                        elif '. ' in line:
                            (var_code, var_code_desc) = line.strip().split(
                                sep='. ', maxsplit=1)
                            is_valid_code = True
                        else:
                            raise AssertionError(
                                "Program error. Line unaccounted for:\n" +
                                "{line}".format(line=line))
                        if is_valid_code:
                            if 'var_codes' not in ddict['record_types'][record_type][var_name]:
                                ddict['record_types'][record_type][var_name]['var_codes'] = collections.OrderedDict()
                            ddict['record_types'][record_type][var_name]['var_codes'][var_code] = var_code_desc
                    # Example case: ".any person in group quarters)"
                    else:
                        var_code_desc = line.strip().lstrip('.')
                        ddict['record_types'][record_type][var_name]['var_codes'][var_code] += ' '+var_code_desc
            # Example inconsistent format case:"""
            # ADJHSG 7
            # Adjustment factor for housing dollar amounts (6 implied decimal places)
            # """
            elif (catch_var_desc and
                    'description' not in ddict['record_types'][record_type][var_name]):
                var_desc = line.strip()
                ddict['record_types'][record_type][var_name]['description'] = var_desc
                catch_var_desc = False
                catch_var_code = True
            # Example inconsistent format case:"""
            # WGTP10 5
            #     Housing Weight replicate 10
            #           -9999..09999 .Integer weight of housing unit
            # WGTP11 5
            #     Housing Weight replicate 11
            #           -9999..09999 .Integer weight of housing unit
            # """
            elif ((var_name == 'WGTP10' and 'WGTP11' in line)
                    or (var_name == 'YOEP12' and 'ANC' in line)):
                # type(var_note) == list
                (var_name, var_len, *var_note) = line.strip().split(maxsplit=2)
                ddict['record_types'][record_type][var_name] = collections.OrderedDict()
                ddict['record_types'][record_type][var_name]['length'] = var_len
                if len(var_note) > 0:
                    if 'notes' not in ddict['record_types'][record_type][var_name]:
                        ddict['record_types'][record_type][var_name]['notes'] = list()
                    ddict['record_types'][record_type][var_name]['notes'].append(var_note[0])
                catch_var_name = False
                catch_var_desc = True
                var_desc_indent = None
            else:
                if (catch_var_name, catch_var_desc,
                        catch_var_code, catch_var_note) != (False, )*4:
                    raise AssertionError(
                        "Program error. All flags to catch lines should be set " +
                        "to `False` by end-of-file.")
                if var_name != var_name_last:
                    raise AssertionError(
                        "Program error. End-of-file notes should only be read " +
                        "after `var_name_last` has been processed.")
                if 'notes' not in ddict:
                    ddict['notes'] = list()
                ddict['notes'].append(line)
    return ddict
Create the hierarchical dataframe (formatted below as Jupyter Notebook cells):
In [ ]:
import pandas as pd
ddict = parse_pumsdatadict(path=r'/path/to/PUMSDataDict2013.txt')
tmp = dict()
for record_type in ddict['record_types']:
    tmp[record_type] = pd.DataFrame.from_dict(ddict['record_types'][record_type], orient='index')
df_ddict = pd.concat(tmp, names=['record_type', 'var_name'])
df_ddict.head()
Out[ ]:
                        length                                        description                                           var_codes                                               notes
record_type    var_name
HOUSING RECORD ACCESS        1                             Access to the Internet  {'b': 'N/A (GQ)', '1': 'Yes, with subscription...                                                 NaN
               ACR           1                                           Lot size  {'b': 'N/A (GQ/not a one-family house or mobil...                                                 NaN
               ADJHSG        7  Adjustment factor for housing dollar amounts (...              {'1000000': '2013 factor (1.000000)'}  [Note: The value of ADJHSG inflation-adjusts r...
               ADJINC        7  Adjustment factor for income and earnings doll...              {'1007549': '2013 factor (1.007549)'}  [Note: The value of ADJINC inflation-adjusts r...
               AGS           1       Sales of Agriculture Products (Yearly sales)  {'b': 'N/A (GQ/vacant/not a one family house o...    [Note: no adjustment factor is applied to AGS.]
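If the frame needs to be saved out, it can be written to disk directly; the var_codes dictionaries are simply serialized as their string representation (a sketch with a hypothetical filename):

df_ddict.to_csv('pums_2013_data_dict.csv')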
I want to compare two strings containing HTML in a Python unittest.
Is there a method which outputs the result in a human-friendly (diff-like) form?
A simple method is to strip whitespace from the HTML and split it into a list. Python 2.7's unittest (or the backported unittest2) then gives a human-readable diff between the lists.
import re

def split_html(html):
    return re.split(r'\s*\n\s*', html.strip())

def test_render_html(self):
    expected = ['<div>', '...', '</div>']
    got = split_html(render_html())
    self.assertEqual(expected, got)
If I'm writing a test for working code, I usually first set expected = [], insert a self.maxDiff = None before the assert and let the test fail once. The expected list can then be copy-pasted from the test output.
You might need to tweak how whitespace is stripped depending on what your HTML looks like.
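Put together as a runnable sketch of that workflow (the HTML literal and class name are just stand-ins):

import re
import unittest

def split_html(html):
    return re.split(r'\s*\n\s*', html.strip())

class RenderTest(unittest.TestCase):
    def test_render_html(self):
        self.maxDiff = None    # show the full list diff on failure
        expected = []          # start empty, then copy-paste from the failure output
        got = split_html('<div>\n  <p>hello</p>\n</div>')
        self.assertEqual(expected, got)

if __name__ == '__main__':
    unittest.main()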
I submitted a patch to do this some years back. The patch was rejected but you can still view it on the python bug list.
I doubt you would want to hack your unittest.py to apply the patch (if it even still works after all this time), but here is the function for reducing two strings to a manageable size while still keeping at least part of what differs. As long as you don't need the complete differences, this might be what you want:
def shortdiff(x, y):
    '''shortdiff(x,y)
    Compare strings x and y and display differences.
    If the strings are too long, shorten them to fit
    in one line, while still keeping at least some difference.
    '''
    import difflib
    LINELEN = 79

    def limit(s):
        if len(s) > LINELEN:
            return s[:LINELEN-3] + '...'
        return s

    def firstdiff(s, t):
        span = 1000
        for pos in range(0, max(len(s), len(t)), span):
            if s[pos:pos+span] != t[pos:pos+span]:
                for index in range(pos, pos+span):
                    if s[index:index+1] != t[index:index+1]:
                        return index

    left = LINELEN // 4  # integer division so the slices below get int indexes
    index = firstdiff(x, y)
    if index > left + 7:
        x = x[:left] + '...' + x[index-4:index+LINELEN]
        y = y[:left] + '...' + y[index-4:index+LINELEN]
    else:
        x, y = x[:LINELEN+1], y[:LINELEN+1]
        left = 0
    cruncher = difflib.SequenceMatcher(None)
    xtags = ytags = ""
    cruncher.set_seqs(x, y)
    editchars = { 'replace': ('^', '^'),
                  'delete':  ('-', ''),
                  'insert':  ('', '+'),
                  'equal':   (' ', ' ') }
    for tag, xi1, xi2, yj1, yj2 in cruncher.get_opcodes():
        lx, ly = xi2 - xi1, yj2 - yj1
        edits = editchars[tag]
        xtags += edits[0] * lx
        ytags += edits[1] * ly
    # Include ellipsis in edits line.
    if left:
        xtags = xtags[:left] + '...' + xtags[left+3:]
        ytags = ytags[:left] + '...' + ytags[left+3:]
    diffs = [x, xtags, y, ytags]
    if max([len(s) for s in diffs]) < LINELEN:
        return '\n'.join(diffs)
    diffs = [limit(s) for s in diffs]
    return '\n'.join(diffs)
Maybe this is a somewhat 'verbose' solution. You could add a new 'equality function' for your user-defined type (e.g. HTMLString), which you have to define first:
class HTMLString(str):
    pass
Now you have to define a type equality function:
def assertHTMLStringEqual(first, second, msg=None):  # assertEqual passes msg through to this function
    if first != second:
        message = ...  # TODO here: format your message, e.g. a diff
        raise AssertionError(message)
All you have to do is format your message as you like. You can also use a class method on your specific TestCase as the type equality function; this gives you more flexibility when formatting the message, since unittest.TestCase itself does this a lot.
Now you have to register this equality function in your unittest.TestCase:
...
def __init__(self):
    self.addTypeEqualityFunc(HTMLString, assertHTMLStringEqual)
The same for a class method:
...
def __init__(self):
    self.addTypeEqualityFunc(HTMLString, 'assertHTMLStringEqual')
And now you can use it in your tests:
def test_something(self):
    htmlstring1 = HTMLString(...)
    htmlstring2 = HTMLString(...)
    self.assertEqual(htmlstring1, htmlstring2)
This should work well with python 2.7.
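A fuller sketch of how the registration slots into a test case (Python 3 syntax here; note the super().__init__ call, and that assertEqual passes a msg keyword through to the registered function):

import unittest

class HTMLString(str):
    pass

def assertHTMLStringEqual(first, second, msg=None):
    if first != second:
        raise AssertionError(msg or 'HTML strings differ')  # TODO: build a real diff here

class MyHTMLTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.addTypeEqualityFunc(HTMLString, assertHTMLStringEqual)

    def test_something(self):
        self.assertEqual(HTMLString('<p>a</p>'), HTMLString('<p>a</p>'))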
I (the one asking this question) use BeautifulSoup now:
def assertEqualHTML(string1, string2, file1='', file2=''):
    u'''
    Compare two unicode strings containing HTML.
    A human-friendly diff goes to logging.error() if they
    are not equal, and an exception gets raised.
    '''
    from BeautifulSoup import BeautifulSoup as bs
    import difflib
    import logging

    def short(mystr):
        max = 20
        if len(mystr) > max:
            return mystr[:max]
        return mystr

    p = []
    for mystr, file in [(string1, file1), (string2, file2)]:
        if not isinstance(mystr, unicode):
            raise Exception(u'string is not unicode: %r %s' % (short(mystr), file))
        soup = bs(mystr)
        pretty = soup.prettify()
        p.append(pretty)
    if p[0] != p[1]:
        for line in difflib.unified_diff(p[0].splitlines(), p[1].splitlines(),
                                         fromfile=file1, tofile=file2):
            logging.error(line)
        raise Exception('Not equal %s %s' % (file1, file2))
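A typical call would look like this (the strings and filenames are just placeholders):

assertEqualHTML(u'<div><p>Hi</p></div>',
                u'<div>\n  <p>Hi</p>\n</div>',
                file1='got.html', file2='expected.html')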