Match word irrespective of the case - python

Dataset:
> df
Id Clean_Data
1918916 Luxury Apartments consisting 11 towers Well equipped gymnasium Swimming Pool Toddler Pool Health Club Steam Room Sauna Jacuzzi Pool Table Chess Billiards room Carom Table Tennis indoor games
1495638 near medavakkam junction calm area near global hospital
1050651 No Pre Emi No Booking Amount No Floor Rise Charges No Processing Fee HLPROJECT HIGHLIGHTS
Below is the code, which successfully returns the matching n-gram words from the list of values in Category.py:
df['one_word_tokenized_text'] =df["Clean_Data"].str.split()
df['bigram'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 2)))
df['trigram'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 3)))
df['four_words'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 4)))
token=pd.Series(df["one_word_tokenized_text"])
Lid=pd.Series(df["Id"])
matches= token.apply(lambda x: pd.Series(x).str.extractall("|".join(["({})".format(cat) for cat in Categories.HealthCare])))
match_list= [[m for m in match.values.ravel() if isinstance(m, str)] for match in matches]
match_df = pd.DataFrame({"ID":Lid,"jc1":match_list})
def match_word(feature, row):
    categories = []
    for bigram in row.bigram:
        joined = ' '.join(bigram)
        if joined in feature:
            categories.append(joined)
    for trigram in row.trigram:
        joined = ' '.join(trigram)
        if joined in feature:
            categories.append(joined)
    for fourwords in row.four_words:
        joined = ' '.join(fourwords)
        if joined in feature:
            categories.append(joined)
    return categories
match_df['Health1'] = df.apply(partial(match_word, HealthCare), axis=1)
match_df['HealthCare'] = match_df[match_df.columns[[1,2]]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
Category.py
category = [('steam room','IN','HealthCare'),
('sauna','IN','HealthCare'),
('Jacuzzi','IN','HealthCare'),
('Aerobics','IN','HealthCare'),
('yoga room','IN','HealthCare'),]
HealthCare= [e1 for (e1, rel, e2) in category if e2=='HealthCare']
Output:
ID HealthCare
1918916 Jacuzzi
1495638
1050651 Aerobics, Jacuzzi, yoga room
Here, if I write the features in the category list in exactly the same letter case as they appear in the dataset, the code identifies them and returns the value; otherwise it does not.
So I want my code to be case-insensitive, so that it also tracks "Steam Room" and "Sauna" under the health category. I tried the ".lower()" function, but I am not sure how to implement it.
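For instance, a minimal sketch of the .lower() approach (the match_word_ci helper is hypothetical, not part of the original code): lower-case the category list once, compare each n-gram in lower case, and keep the original casing in the output:
HealthCare_lower = [cat.lower() for cat in HealthCare]

def match_word_ci(feature_lower, row):
    # Compare in lower case; report the n-gram as written in the data.
    categories = []
    for grams in (row.bigram, row.trigram, row.four_words):
        for gram in grams:
            joined = ' '.join(gram)
            if joined.lower() in feature_lower:
                categories.append(joined)
    return categories

match_df['Health1'] = df.apply(partial(match_word_ci, HealthCare_lower), axis=1)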

Edit 2: only Category.py is updated
Category.py
category = [('steam room','IN','HealthCare'),
('sauna','IN','HealthCare'),
('jacuzzi','IN','HealthCare'),
('aerobics','IN','HealthCare'),
('Yoga room','IN','HealthCare'),
('booking','IN','HealthCare'),
]
category1 = [value[0].capitalize() for index, value in enumerate(category)]
category2 = [value[0].lower() for index, value in enumerate(category)]
test = []
test2 =[]
for index, value in enumerate(category1):
    test.append((value, category[index][1], category[index][2]))
for index, value in enumerate(category2):
    test2.append((value, category[index][1], category[index][2]))
category = category + test + test2
HealthCare = [e1 for (e1, rel, e2) in category if e2=='HealthCare']
Using your unaltered dataset:
import pandas as pd
from nltk import ngrams, word_tokenize
import Categories
from Categories import *
from functools import partial
data = {'Clean_Data':['Luxury Apartments consisting 11 towers Well equipped gymnasium Swimming Pool Toddler Pool Health Club Steam Room Sauna Jacuzzi Pool Table Chess Billiards room Carom Table Tennis indoor games',
'near medavakkam junction calm area near global hospital',
'No Pre Emi No Booking Amount No Floor Rise Charges No Processing Fee HLPROJECT HIGHLIGHTS '],
'Id' : [1918916, 1495638,1050651]}
df = pd.DataFrame(data)
df['one_word_tokenized_text'] =df["Clean_Data"].str.split()
df['bigram'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 2)))
df['trigram'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 3)))
df['four_words'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 4)))
token=pd.Series(df["one_word_tokenized_text"])
Lid=pd.Series(df["Id"])
matches= token.apply(lambda x: pd.Series(x).str.extractall("|".join(["({})".format(cat) for cat in Categories.HealthCare])))
match_list= [[m for m in match.values.ravel() if isinstance(m, str)] for match in matches]
match_df = pd.DataFrame({"ID":Lid,"jc1":match_list})
def match_word(feature, row):
    categories = []
    for bigram in row.bigram:
        joined = ' '.join(bigram)
        if joined in feature:
            categories.append(joined)
    for trigram in row.trigram:
        joined = ' '.join(trigram)
        if joined in feature:
            categories.append(joined)
    for fourwords in row.four_words:
        joined = ' '.join(fourwords)
        if joined in feature:
            categories.append(joined)
    return categories
match_df['Health1'] = df.apply(partial(match_word, HealthCare), axis=1)
match_df['HealthCare'] = match_df[match_df.columns[[1,2]]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)
Output
print(match_df)
+--------+----------------+-------------+------------------------------------+
|ID |jc1 |Health1 |HealthCare |
+--------+----------------+-------------+------------------------------------+
|1918916 |[sauna, jacuzzi]| |['sauna', 'jacuzzi'],['steam room'] |
+--------+----------------+-------------+------------------------------------+
|1495638 | | | |
+--------+----------------+-------------+------------------------------------+
|1050651 |[Booking]       |             |['Booking'],[]                      |
+--------+----------------+-------------+------------------------------------+
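A side note on the extractall step: instead of enumerating case variants in Category.py, the pattern itself can be made case-insensitive. A sketch, assuming the same token series as above; re.escape and the \b word boundaries are additions that also stop e.g. 'booking' from matching inside a longer word:
import re

pattern = "|".join(r"\b({})\b".format(re.escape(cat)) for cat in Categories.HealthCare)
matches = token.apply(lambda x: pd.Series(x).str.extractall(pattern, flags=re.IGNORECASE))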

Related

Python Regex Capturing Multiple Matches in separate observations

I am trying to create variables location; contract items; contract code; federal aid using regex on the following text:
PAGE 1
BID OPENING DATE 07/25/18 FROM 0.2 MILES WEST OF ICE HOUSE 07/26/18 CONTRACT NUMBER 03-2F1304 ROAD TO 0.015 MILES WEST OF CONTRACT CODE 'A '
LOCATION 03-ED-50-39.5/48.7 DIVISION HIGHWAY ROAD 44 CONTRACT ITEMS
INSTALL SANDTRAPS AND PULLOUTS FEDERAL AID ACNH-P050-(146)E
PAGE 1
BID OPENING DATE 07/25/18 IN EL DORADO COUNTY AT VARIOUS 07/26/18 CONTRACT NUMBER 03-2H6804 LOCATIONS ALONG ROUTES 49 AND 193 CONTRACT CODE 'C ' LOCATION 03-ED-0999-VAR 13 CONTRACT ITEMS
TREE REMOVAL FEDERAL AID NONE
PAGE 1
BID OPENING DATE 07/25/18 IN LOS ANGELES, INGLEWOOD AND 07/26/18 CONTRACT NUMBER 07-296304 CULVER CITY, FROM I-105 TO PORT CONTRACT CODE 'B '
LOCATION 07-LA-405-R21.5/26.3 ROAD UNDERCROSSING 55 CONTRACT ITEMS
ROADWAY SAFETY IMPROVEMENT FEDERAL AID ACIM-405-3(056)E
This text is from one word file; I'll be looping my code on multiple doc files. In the text above are three location; contract items; contract code; federal aid pairs. But when I use regex to create variables, only the first instance of each pair is included.
The code I have right now is:
# imports
import os
import pandas as pd
import re
import docx2txt
import textract
import antiword
all_bod = []
all_cn = []
all_location = []
all_fedaid = []
all_contractcode = []
all_contractitems = []
all_file = []
text = '''PAGE 1
BID OPENING DATE 07/25/18 FROM 0.2 MILES WEST OF ICE HOUSE 07/26/18 CONTRACT NUMBER 03-2F1304 ROAD TO 0.015 MILES WEST OF CONTRACT CODE 'A '
LOCATION 03-ED-50-39.5/48.7 DIVISION HIGHWAY ROAD 44 CONTRACT ITEMS
INSTALL SANDTRAPS AND PULLOUTS FEDERAL AID ACNH-P050-(146)E
PAGE 1
BID OPENING DATE 07/25/18 IN EL DORADO COUNTY AT VARIOUS 07/26/18 CONTRACT NUMBER 03-2H6804 LOCATIONS ALONG ROUTES 49 AND 193 CONTRACT CODE 'C ' LOCATION 03-ED-0999-VAR 13 CONTRACT ITEMS
TREE REMOVAL FEDERAL AID NONE
PAGE 1
BID OPENING DATE 07/25/18 IN LOS ANGELES, INGLEWOOD AND 07/26/18 CONTRACT NUMBER 07-296304 CULVER CITY, FROM I-105 TO PORT CONTRACT CODE 'B '
LOCATION 07-LA-405-R21.5/26.3 ROAD UNDERCROSSING 55 CONTRACT ITEMS
ROADWAY SAFETY IMPROVEMENT FEDERAL AID ACIM-405-3(056)E'''
bod1 = re.search('BID OPENING DATE \s+ (\d+\/\d+\/\d+)', text)
bod2 = re.search('BID OPENING DATE\n\n(\d+\/\d+\/\d+)', text)
if bod1 is not None:
    bod = bod1.group(1)
elif bod2 is not None:
    bod = bod2.group(1)
else:
    bod = 'NA'
all_bod.append(bod)
# creating contract number
cn1 = re.search('CONTRACT NUMBER\n+(.*)', text)
cn2 = re.search('CONTRACT NUMBER\s+(.........)', text)
if cn1 is not None:
    cn = cn1.group(1)
elif cn2 is not None:
    cn = cn2.group(1)
else:
    cn = 'NA'
all_cn.append(cn)
# location
location1 = re.search('LOCATION \s+\S+', text)
location2 = re.search('LOCATION \n+\S+', text)
if location1 is not None:
    location = location1.group(0)
elif location2 is not None:
    location = location2.group(0)
else:
    location = 'NA'
all_location.append(location)
# federal aid
fedaid = re.search('FEDERAL AID\s+\S+', text)
fedaid = fedaid.group(0)
all_fedaid.append(fedaid)
# contract code
contractcode = re.search('CONTRACT CODE\s+\S+', text)
contractcode = contractcode.group(0)
all_contractcode.append(contractcode)
# contract items
contractitems = re.search('\d+ CONTRACT ITEMS', text)
contractitems = contractitems.group(0)
all_contractitems.append(contractitems)
This code parses only the first instance of each of these variables in the text:
contract-number | location           | contract-items | contract-code | federal-aid
----------------+--------------------+----------------+---------------+-----------------
03-2F1304       | 03-ED-50-39.5/48.7 | 44             | A             | ACNH-P050-(146)E
But, I am trying to figure out a way to get all possible instances in different observations.
contract-number | location             | contract-items | contract-code | federal-aid
----------------+----------------------+----------------+---------------+-----------------
03-2F1304       | 03-ED-50-39.5/48.7   | 44             | A             | ACNH-P050-(146)E
03-2H6804       | 03-ED-0999-VAR       | 13             | C             | NONE
07-296304       | 07-LA-405-R21.5/26.3 | 55             | B             | ACIM-405-3(056)E
The all_* variables in the code are for looping over multiple Word files - we can ignore that if we want :).
Any leads would be super helpful. Thanks so much!
import re
import pandas as pd

data = []
df = pd.DataFrame()
# Named groups let finditer collect every occurrence of each field.
regex_contract_number = r"(?:CONTRACT NUMBER\s+(?P<contract_number>\S+?)\s)"
regex_location = r"(?:LOCATION\s+(?P<location>\S+))"
regex_contract_items = r"(?:(?P<contract_items>\d+)\sCONTRACT ITEMS)"
regex_federal_aid = r"(?:FEDERAL AID\s+(?P<federal_aid>\S+?)\s)"
regex_contract_code = r"(?:CONTRACT CODE\s+\'(?P<contract_code>\S+?)\s)"
regexes = [regex_contract_number, regex_location, regex_contract_items, regex_federal_aid, regex_contract_code]
for regex in regexes:
    # Collect one column of values: every match of this pattern, in order.
    for match in re.finditer(regex, text):
        data.append(match.groupdict())
    df = pd.concat([df, pd.DataFrame(data)], axis=1)
    data = []
df
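This lines up because each pattern happens to match exactly once per record and re.finditer yields matches in document order. If a record could be missing a field (e.g. no FEDERAL AID line at all), a per-record pass keeps the rows aligned; the split on 'PAGE 1' below is an assumption about the input format:
# Hedged sketch: search each record separately so a missing field becomes
# NaN instead of shifting every later row up by one.
records = [r for r in re.split(r'PAGE 1', text) if r.strip()]
rows = []
for rec in records:
    row = {}
    for regex in regexes:
        m = re.search(regex, rec)
        if m:
            row.update(m.groupdict())
    rows.append(row)
df_records = pd.DataFrame(rows)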

How to create function to pass lat long in api call for get weather data

I try to get the data from the pyOWM package using the city name, but in some cases a typo in the city name means no data is returned, and it breaks the process.
I want to get the weather data using lat-long instead, but I don't know how to set up a function for it.
Df1:
-----
User  City          State                Zip     Lat         Long
---------------------------------------------------------------------
A     Kuala Lumpur  Wilayah Persekutuan  50100   5.3288907   103.1344397
B     Dublin        County Dublin        NA      50.2030506  14.5509842
C     Oconomowoc    NA                   NA      53.3640384  -6.1953066
D     Mumbai        Maharashtra          400067  19.2177166  72.9708833
E     Mratin        Stredocesky kraj     250 63  40.7560585  -5.6924778
...
----------------------------------
Code:
--------
import time
from tqdm.notebook import tqdm
import pyowm
from pyowm.utils import config
from pyowm.utils import timestamps
cities = Df1["City"].unique().tolist()
cities1 = cities[:5]
owm = pyowm.OWM('bee8db7d50a4b777bfbb9f47d9beb7d0')
mgr = owm.weather_manager()
'''
Step-1 Define list where save the data
'''
list_wind_Speed =[]
list_tempreture =[]
list_max_temp =[]
list_min_temp =[]
list_humidity =[]
list_pressure =[]
list_city = []
list_cloud=[]
list_status =[]
list_rain =[]
'''
Step-2 Fetch data
'''
j = 0
for city in tqdm(cities1):
    j += 1
    if j < 60:
        # one_call_obs = owm.weather_at_coords(52.5244, 13.4105).weather
        # one_call_obs.current.humidity
        observation = mgr.weather_at_place(city)
        l = observation.weather
        list_city.append(city)
        list_wind_Speed.append(l.wind()['speed'])
        list_tempreture.append(l.temperature('celsius')['temp'])
        list_max_temp.append(l.temperature('celsius')['temp_max'])
        list_min_temp.append(l.temperature('celsius')['temp_min'])
        list_humidity.append(l.humidity)
        list_pressure.append(l.pressure['press'])
        list_cloud.append(l.clouds)
        list_rain.append(l.rain)
        list_status.append(l.detailed_status)  # e.g. 'moderate rain'; feeds df2["Status"] below
    else:
        time.sleep(60)
        j = 0
'''
Step-3 Blank data frame and store data in that
'''
df2 = pd.DataFrame()
df2["City"] = list_city
df2["Temp"] = list_tempreture
df2["Max_Temp"] = list_max_temp
df2["Min_Temp"] = list_min_temp
df2["Cloud"] = list_cloud
df2["Humidity"] = list_humidity
df2["Pressure"] = list_pressure
df2["Status"] = list_status
df2["Rain"] = list_status
df2
From the above code, I get the result as below,
City | Temp |Max_Temp|Min_Temp|Cloud |Humidity|Pressure |Status | Rain
------------------------------------------------------------------------------------------
Kuala Lumpur|29.22 |30.00 |27.78 | 20 |70 |1007 | moderate rain | moderate rain
Dublin |23.12 |26.43 |22.34 | 15 |89 | 978 | cloudy | cloudy
...
Now the process stops because of typos in some city names.
I am looking for an alternate solution: get the weather data from Lat-Long instead, but I don't know how to write a function that passes the lat and long column data.
Df1 = {'User':['A','B','C','D','E'],
       'City':['Kuala Lumpur','Dublin','Oconomowoc','Mumbai','Mratin'],
       'State':['Wilayah Persekutuan','County Dublin',None,'Maharashtra','Stredocesky kraj'],
       'Zip':['50100',None,None,'400067','250 63'],
       'Lat':[5.3288907,50.2030506,53.3640384,19.2177166,40.7560585],
       'Long':[103.1344397,14.5509842,-6.1953066,72.9708833,-5.6924778]}
# Try to use this code to get wather data
# one_call_obs = owm.weather_at_coords(52.5244, 13.4105).weather
# one_call_obs.current.humidity
Expected Result
--------------
User | City | Lat | Long | Temp | Cloud | Humidity | Pressure | Rain | Status
-----------------------------------------------------------------------------
Catch the error if a city is not found, parse the lat/lon from the dataframe. Use that lat/lon to create a bounding box and use weather_at_places_in_bbox to get a list of observations in that area.
import time
from tqdm.notebook import tqdm
import pyowm
from pyowm.utils import config
from pyowm.utils import timestamps
import pandas as pd
from pyowm.commons.exceptions import NotFoundError, ParseAPIResponseError
df1 = pd.DataFrame({'City': ('Kuala Lumpur', 'Dublin', 'Oconomowoc', 'Mumbai', 'C airo', 'Mratin'),
'Lat': ('5.3288907', '50.2030506', '53.3640384', '19.2177166', '30.22', '40.7560585'),
'Long': ('103.1344397', '14.5509842', '-6.1953066', '72.9708833', '31', '-5.6924778')})
cities = df1["City"].unique().tolist()
owm = pyowm.OWM('bee8db7d50a4b777bfbb9f47d9beb7d0')
mgr = owm.weather_manager()
for city in cities:
    try:
        observation = mgr.weather_at_place(city)
        # print(city, observation)
    except NotFoundError:
        # get city by lat/lon
        lat_top = float(df1.loc[df1['City'] == city, 'Lat'])
        lon_left = float(df1.loc[df1['City'] == city, 'Long'])
        lat_bottom = lat_top - 0.3
        lon_right = lon_left + 0.3
        try:
            observations = mgr.weather_at_places_in_bbox(lon_left, lat_bottom, lon_right, lat_top, zoom=5)
            observation = observations[0]
        except ParseAPIResponseError:
            raise RuntimeError(f"Couldn't find {city} at lat: {lat_top} / lon: {lon_right}, try tweaking the bounding box")
    weather = observation.weather
    temp = weather.temperature('celsius')['temp']
    print(f"The current temperature in {city} is {temp}")

Search a series for a word. Return that word and N others in a new column?

Okay, I need help. I created a function to search a string for a specific word. If the function finds the search_word, it returns that word and the N words that precede it. The function works fine with my test strings, but I cannot figure out how to apply the function to an entire series.
My goal is to create a new column in the data frame that contains the n_words_prior whenever the search_word exists.
n_words_prior = []
test = "New School District, Dale County"
def n_before_string(string, search_word, N):
    global n_words_prior
    n_words_prior = []
    found_word = string.find(search_word)
    if found_word == -1:
        return ""
    sentence = string[0:found_word]
    n_words_prior = sentence.split()[N:]
    n_words_prior.append(search_word)
    return n_words_prior
The current dataframe looks like this:
data = [['Alabama', 'New School District, Dale County'],
['Alaska', 'Matanuska-Susitna Borough'],
['Arizona', 'Pima County - Tuscon Unified School District']]
df = pd.DataFrame(data, columns = ['State', 'Place'])
The improved function would take the inputs 'Place','County',-1 and create the following result.
improved_function(column, search_word, N)
new_data = [['Alabama', 'New School District, Dale County','Dale County'],
['Alaska', 'Matanuska-Susitna Borough', ''],
['Arizona', 'Pima County - Tuscon Unified School District','Pima County']]
new_df = pd.DataFrame(new_data, columns = ['State', 'Place','Result'])
I thought embedding this function would help, but it has only made things more confusing.
def fast_add(place, search_word):
    df[search_word] = df['Place'].str.contains(search_word).apply(lambda found: 1 if found == True else 0)
def fun(sentence, search_word, n):
    """Return search_word and n preceding words from sentence."""
    words = sentence.split()
    for i, word in enumerate(words):
        if word == search_word:
            # max() guards against a match within the first n words.
            return ' '.join(words[max(i - n, 0):i + 1])
    return ''
Example:
df['Result'] = df.Place.apply(lambda x: fun(x, 'County', 1))
Result:
State Place Result
0 Alabama New School District, Dale County Dale County
1 Alaska Matanuska-Susitna Borough
2 Arizona Pima County - Tuscon Unified School District Pima County
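For the one-preceding-word case, the same result can also be had without a Python-level loop; a sketch using Series.str.extract (the pattern assumes whitespace-separated words and a literal search word):
df['Result'] = df['Place'].str.extract(r'((?:\S+\s)?County)', expand=False).fillna('')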

I am trying to hide data on the basis of certain condition

I am trying to hide the data in a single column. If the column contains a value that is in exceptionList, it should be skipped and we move on to the next; but somehow I am not able to hide the rest, and it throws this error:
if(x in exceptionList):
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Here is my code:
data = [['NISAMANEE ROWELL', '9198762345','98 Oxford Ave.Elk Grove Village, IL 60007'], ['ALICE BAISDEN', '8756342865', '94 Valley Rd.Miami Gardens, FL 33056'], ['MARC COGNETTI', '9198762345', '221 Summer CircleGreer, SC 29650'], ['JOHNS HOPKINS HEALTHCARE', '9654987642', '8522 Pendergast AvenueVilla Park, IL 60181']]
df = pd.DataFrame(data, columns = ['Name', 'Number', 'Address'])
df
def title_format(inp):
    return inp.str.title()

def new(x):
    #x = input('Enter your column name')
    #x = x.title()
    x = title_format(x)
    print(x)
    exc_list=['Mackesson Inc','Care','Healthcare','Henery Schien','Besse','LLC','CandP','INC','LTD','PHARMACY','PHARMACEUTICAL','HOSPITAL','COMPANY','ELECTRONICS','APP','VOLUNTEERS','SPECIALITIES','APPLIANCE','EXPRESS','MAGAZINE','SUPPLY','ENDOSCOPY','NETWandK','SCHOOL','AT&T','SOLUTIONS','SANITATION','SYSTEMS','COMPOUNDING','CLINIC','UTILITIES','DEPARTMENT','CREATIVE','PIN','employment','consultant','units','label','machine','anesthesia','services','medical','community','plaza','tech','bipolar','brand','commerce','testing','inspection','killer','plus','electric','division','diagnostic','materials','imaging','international','district','chamber','city','products','essentials','life','scissand','leasing','units','health','healthcare','surgical','enterprises','print','radiology','water','screens','telecom']
    exceptionList = [z.title() for z in exc_list]
    if(x in exceptionList):
        return x
    else:
        return x.str.replace(x, 'X' * random.randrange(3, 8))

#new(df.Name.astype(str))
new(df['Name'].astype(str))
As far as I understand what you want, I changed several lines in your code:
import pandas as pd
import random
data = [['NISAMANEE ROWELL', '9198762345','98 Oxford Ave.Elk Grove Village, IL 60007'], ['ALICE BAISDEN', '8756342865', '94 Valley Rd.Miami Gardens, FL 33056'], ['MARC COGNETTI', '9198762345', '221 Summer CircleGreer, SC 29650'], ['Healthcare', '9654987642', '8522 Pendergast AvenueVilla Park, IL 60181']]
df = pd.DataFrame(data, columns = ['Name', 'Number', 'Address'])
def title_format(inp):
    return inp.str.title()

def new(x):
    #x = input('Enter your column name')
    #x = x.title()
    x = title_format(x)
    print(x)
    exc_list=['Mackesson Inc','Care','Healthcare','Henery Schien','Besse','LLC','CandP','INC','LTD','PHARMACY','PHARMACEUTICAL','HOSPITAL','COMPANY','ELECTRONICS','APP','VOLUNTEERS','SPECIALITIES','APPLIANCE','EXPRESS','MAGAZINE','SUPPLY','ENDOSCOPY','NETWandK','SCHOOL','AT&T','SOLUTIONS','SANITATION','SYSTEMS','COMPOUNDING','CLINIC','UTILITIES','DEPARTMENT','CREATIVE','PIN','employment','consultant','units','label','machine','anesthesia','services','medical','community','plaza','tech','bipolar','brand','commerce','testing','inspection','killer','plus','electric','division','diagnostic','materials','imaging','international','district','chamber','city','products','essentials','life','scissand','leasing','units','health','healthcare','surgical','enterprises','print','radiology','water','screens','telecom']
    exceptionList = [z.title() for z in exc_list]
    match = [x1 in exceptionList for x1 in x]
    df.loc[match, 'Name'] = ['X' * random.randrange(3, 8) for a in range(sum(match))]
    # return x
    # else:
    #     return x.str.replace(x, 'X' * random.randrange(3, 8))

#new(df.Name.astype(str))
new(df['Name'].astype(str))
df
Out[1]:
Name Number Address
0 NISAMANEE ROWELL 9198762345 98 Oxford Ave.Elk Grove Village, IL 60007
1 ALICE BAISDEN 8756342865 94 Valley Rd.Miami Gardens, FL 33056
2 MARC COGNETTI 9198762345 221 Summer CircleGreer, SC 29650
3 XXXXXXX 9654987642 8522 Pendergast AvenueVilla Park, IL 60181
A more optimal way to do the same:
exc_list = [x.title() for x in exc_list]
df['Name'] = df['Name'].map(str.title)
df['match'] = [nn in exc_list for nn in df['Name']]
df.loc[df['match'] == True,'Name'] = ['X' * random.randrange(3, 8) for a in range(sum(df['match']))]
To hide the first 3 symbols:
exc_list = [x.title() for x in exc_list]
df['Name'] = df['Name'].map(str.title)
df['match'] = [nn in exc_list for nn in df['Name']]
df['NameIf'] = list(zip(df['Name'], [('XXX' + s[3:] if len(s) > 3 else 'XXX') for s in df['Name']]))
df['Name'] = [n[0][n[1]] for n in list(zip(df['NameIf'],df['match'].astype(int)))]
df = df.drop(['NameIf', 'match'], axis = 1)
df
To hide whole row
exc_list = [x.title() for x in exc_list]
df['Name'] = df['Name'].map(str.title)
df['match'] = [nn in exc_list for nn in df['Name']]
hide_row = {c:'XXX' for c in df.columns}
df[df['match'] != True].merge(pd.DataFrame(hide_row, index = df[df['match'] == True].index), how = 'outer')
Short explanation:
# Step 1. this gives you DataFrame without matching
df[df['match'] != True]
Out[3]:
Name Number Address match
0 Nisamanee Rowell 9198762345 98 Oxford Ave.Elk Grove Village, IL 60007 False
1 Alice Baisden 8756342865 94 Valley Rd.Miami Gardens, FL 33056 False
2 Marc Cognetti 9198762345 221 Summer CircleGreer, SC 29650 False
# Step 2. this opposite gives you DataFrame with matching
df[df['match'] == True]
Out[4]:
Name Number Address match
3 Healthcare 9654987642 8522 Pendergast AvenueVilla Park, IL 60181 True
# Step 3. but you take only index from Step 2. And create new dataframe with indexes and 'XXX' columns
hide_row = {c:'XXX' for c in df.columns}
pd.DataFrame(hide_row, index = df[df['match'] == True].index)
Out[5]:
Name Number Address match
3 XXX XXX XXX XXX
# Step 4. And then you just merge two dataframes from step 1 and step 3 by indexes
df[df['match'] != True].merge(pd.DataFrame(hide_row, index = df[df['match'] == True].index), how = 'outer')
Just a small change in your code will work; mind you, it's not optimal, but it works just fine.
data = [['NISAMANEE ROWELL', '9198762345','98 Oxford Ave.Elk Grove Village, IL 60007'], ['ALICE BAISDEN', '8756342865', '94 Valley Rd.Miami Gardens, FL 33056'], ['MARC COGNETTI', '9198762345', '221 Summer CircleGreer, SC 29650'], ['Healthcare', '9654987642', '8522 Pendergast AvenueVilla Park, IL 60181']]
df = pd.DataFrame(data, columns = ['Name', 'Number', 'Address'])
df
def title_format(inp):
    return inp.title()

def new(x):
    #x = input('Enter your column name')
    #x = x.title()
    x = title_format(x)
    print(x)
    exc_list=['Mackesson Inc','Care','Healthcare','Henery Schien','Besse','LLC','CandP','INC','LTD','PHARMACY','PHARMACEUTICAL','HOSPITAL','COMPANY','ELECTRONICS','APP','VOLUNTEERS','SPECIALITIES','APPLIANCE','EXPRESS','MAGAZINE','SUPPLY','ENDOSCOPY','NETWandK','SCHOOL','AT&T','SOLUTIONS','SANITATION','SYSTEMS','COMPOUNDING','CLINIC','UTILITIES','DEPARTMENT','CREATIVE','PIN','employment','consultant','units','label','machine','anesthesia','services','medical','community','plaza','tech','bipolar','brand','commerce','testing','inspection','killer','plus','electric','division','diagnostic','materials','imaging','international','district','chamber','city','products','essentials','life','scissand','leasing','units','health','healthcare','surgical','enterprises','print','radiology','water','screens','telecom']
    exceptionList = [z.title() for z in exc_list]
    if(x in exceptionList):
        return x
    else:
        return x.replace(x, 'X' * random.randrange(3, 8))

#new(df.Name.astype(str))
df['Name'] = df['Name'].apply(new)
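The keep-if-excepted, mask-otherwise logic above can also be vectorized; a sketch assuming exceptionList has been built at module level and random is imported:
# Mask every name that is NOT in the exception list, in one pass.
mask = ~df['Name'].str.title().isin(exceptionList)
df.loc[mask, 'Name'] = ['X' * random.randrange(3, 8) for _ in range(mask.sum())]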

regex to parse well-formated multi-line data dictionary

I am trying to read and parse a data dictionary for the Census Bureau's American Community Survey Public Use Microsample data release, as found here.
It is reasonably well formatted, although with a few lapses where some explanatory notes are inserted.
My preferred outcome is to get a dataframe with one row per variable, serializing all value labels for a given variable into one dictionary stored in a value-dictionary field in the same row (although a hierarchical, json-like format would not be bad either, just more complicated).
I got the following code:
import pandas as pd
import re
import urllib2
data = urllib2.urlopen('http://www.census.gov/acs/www/Downloads/data_documentation/pums/DataDict/PUMSDataDict13.txt').read()
## replace newline characters so we can use dots and find everything until a double
## carriage return (replaced to ||) with a lookahead assertion.
data = data.replace('\n', '|')
datadict = pd.DataFrame(re.findall("([A-Z]{2,8})\s{2,9}([0-9]{1})\s{2,6}\|\s{2,4}([A-Za-z\-\(\) ]{3,85})", data, re.MULTILINE), columns=['variable','width','description'])
datadict.head(5)
+----+----------+-------+------------------------------------------------+
| | variable | width | description |
+----+----------+-------+------------------------------------------------+
| 0 | RT | 1 | Record Type |
+----+----------+-------+------------------------------------------------+
| 1 | SERIALNO | 7 | Housing unit |
+----+----------+-------+------------------------------------------------+
| 2 | DIVISION | 1 | Division code |
+----+----------+-------+------------------------------------------------+
| 3 | PUMA | 5 | Public use microdata area code (PUMA) based on |
+----+----------+-------+------------------------------------------------+
| 4 | REGION | 1 | Region code |
+----+----------+-------+------------------------------------------------+
| 5 | ST | 2 | State Code |
+----+----------+-------+------------------------------------------------+
So far so good. The list of variables is there, along with the width in characters of each.
I can expand this and get additional lines (where the value labels live), like so:
datadict_exp = pd.DataFrame(
    re.findall("([A-Z]{2,9})\s{2,9}([0-9]{1})\s{2,6}\|\s{4}([A-Za-z\-\(\)\;\<\> 0-9]{2,85})\|\s{11,15}([a-z0-9]{0,2})[ ]\.([A-Za-z/\-\(\) ]{2,120})",
        data, re.MULTILINE),
    columns=['variable','width','description','value_1','label_1'])
datadict_exp.head(5)
+----+----------+-------+---------------------------------------------------+---------+--------------+
| id | variable | width | description | value_1 | label_1 |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 0 | DIVISION | 1 | Division code | 0 | Puerto Rico |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 1 | REGION | 1 | Region code | 1 | Northeast |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 2 | ST | 2 | State Code | 1 | Alabama/AL |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 3 | NP | 2 | Number of person records following this housin... | 0 | Vacant unit |
+----+----------+-------+---------------------------------------------------+---------+--------------+
| 4 | TYPE | 1 | Type of unit | 1 | Housing unit |
+----+----------+-------+---------------------------------------------------+---------+--------------+
So that gets the first value and associated label. My regex issue here is how to repeat the multi-line match starting with \s{11,15} through to the end--i.e. some variables have tons of unique values (ST, or state code, is followed by some 50 lines, denoting the value and label for each state).
Early on I changed the carriage returns in the source file to pipes, thinking that I could then shamelessly rely on the dot to match everything until a double carriage return, indicating the end of that particular variable; and here is where I got stuck.
So--how do I repeat a multi-line pattern an arbitrary number of times?
(A complication for later is that some variables are not fully enumerated in the dictionary, but are shown with valid ranges of values. NP, for example [number of persons associated with the same household], is denoted with `02..20` following a description. If I don't account for this, my parsing will miss such entries, of course.)
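One untested sketch of that repetition (the character classes and the reliance on '||' as a block terminator are assumptions about the file): capture each variable's whole block first, then run a second findall over just that block for the value/label pairs:
# Outer pass: one match per variable, everything up to the '||' that marks
# the blank line ending the block. Inner pass: all value/label pairs.
block_re = re.compile(r'([A-Z0-9]{2,9})\s+([0-9]+)\s*\|(.*?)\|\|')
pair_re = re.compile(r'([bB0-9.\-]{1,12})\s*\.([^|]+)')
rows = []
for variable, width, block in block_re.findall(data):
    labels = dict(pair_re.findall(block))
    rows.append({'variable': variable, 'width': width, 'values': labels})
datadict_nested = pd.DataFrame(rows)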
This isn't a regex, but I parsed PUMSDataDict2013.txt and PUMS_Data_Dictionary_2009-2013.txt (Census ACS 2013 documentation, FTP server) with the Python 3.x script below. I used pandas.DataFrame.from_dict and pandas.concat to create a hierarchical dataframe, also below.
Python 3.x function to parse PUMSDataDict2013.txt and PUMS_Data_Dictionary_2009-2013.txt:
import collections
import os
def parse_pumsdatadict(path:str) -> collections.OrderedDict:
    r"""Parse ACS PUMS Data Dictionaries.

    Args:
        path (str): Path to downloaded data dictionary.

    Returns:
        ddict (collections.OrderedDict): Parsed data dictionary with original
            key order preserved.

    Raises:
        FileNotFoundError: Raised if `path` does not exist.

    Notes:
        * Only some data dictionaries have been tested.[^urls]
        * Values are all strings. No data types are inferred from the
            original file.
        * Example structure of returned `ddict`:
            ddict['title'] = '2013 ACS PUMS DATA DICTIONARY'
            ddict['date'] = 'August 7, 2015'
            ddict['record_types']['HOUSING RECORD']['RT']\
                ['length'] = '1'
                ['description'] = 'Record Type'
                ['var_codes']['H'] = 'Housing Record or Group Quarters Unit'
            ddict['record_types']['HOUSING RECORD'][...]
            ddict['record_types']['PERSON RECORD'][...]
            ddict['notes'] =
                ['Note for both Industry and Occupation lists...',
                 '* In cases where the SOC occupation code ends...',
                 ...]

    References:
        [^urls]: http://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/
            PUMSDataDict2013.txt
            PUMS_Data_Dictionary_2009-2013.txt
    """
    # Check arguments.
    if not os.path.exists(path):
        raise FileNotFoundError(
            "Path does not exist:\n{path}".format(path=path))
    # Parse data dictionary.
    # Note:
    # * Data dictionary keys and values are "codes for variables",
    #   using the ACS terminology,
    #   https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html
    # * The data dictionary is not all encoded in UTF-8. Replace encoding
    #   errors when found.
    # * Catch instances of inconsistently formatted data.
    ddict = collections.OrderedDict()
    with open(path, encoding='utf-8', errors='replace') as fobj:
        # Data dictionary name is line 1.
        ddict['title'] = fobj.readline().strip()
        # Data dictionary date is line 2.
        ddict['date'] = fobj.readline().strip()
        # Initialize flags to catch lines.
        (catch_var_name, catch_var_desc,
         catch_var_code, catch_var_note) = (None, )*4
        var_name = None
        var_name_last = 'PWGTP80' # Necessary for unformatted end-of-file notes.
        for line in fobj:
            # Replace tabs with 4 spaces
            line = line.replace('\t', ' '*4).rstrip()
            # Record type is section header 'HOUSING RECORD' or 'PERSON RECORD'.
            if (line.strip() == 'HOUSING RECORD'
                    or line.strip() == 'PERSON RECORD'):
                record_type = line.strip()
                if 'record_types' not in ddict:
                    ddict['record_types'] = collections.OrderedDict()
                ddict['record_types'][record_type] = collections.OrderedDict()
            # A newline precedes a variable name.
            # A newline follows the last variable code.
            elif line == '':
                # Example inconsistent format case:
                # WGTP54 5
                # Housing Weight replicate 54
                #
                # -9999..09999 .Integer weight of housing unit
                if (catch_var_code
                        and 'var_codes' not in ddict['record_types'][record_type][var_name]):
                    pass
                # Terminate the previous variable block and look for the next
                # variable name, unless past last variable name.
                else:
                    catch_var_code = False
                    catch_var_note = False
                    if var_name != var_name_last:
                        catch_var_name = True
            # Variable name is 1 line with 0 space indent.
            # Variable name is followed by variable description.
            # Variable note is optional.
            # Variable note is preceded by newline.
            # Variable note is 1+ lines.
            # Variable note is followed by newline.
            elif (catch_var_name and not line.startswith(' ')
                    and var_name != var_name_last):
                # Example: "Note: Public use microdata areas (PUMAs) ..."
                if line.lower().startswith('note:'):
                    var_note = line.strip() # type(var_note) == str
                    if 'notes' not in ddict['record_types'][record_type][var_name]:
                        ddict['record_types'][record_type][var_name]['notes'] = list()
                    # Append a new note.
                    ddict['record_types'][record_type][var_name]['notes'].append(var_note)
                    catch_var_note = True
                # Example: """
                # Note: Public Use Microdata Areas (PUMAs) designate areas ...
                # population. Use with ST for unique code. PUMA00 applies ...
                # ...
                # """
                elif catch_var_note:
                    var_note = line.strip() # type(var_note) == str
                    if 'notes' not in ddict['record_types'][record_type][var_name]:
                        ddict['record_types'][record_type][var_name]['notes'] = list()
                    # Concatenate to most recent note.
                    ddict['record_types'][record_type][var_name]['notes'][-1] += ' '+var_note
                # Example: "NWAB 1 (UNEDITED - See 'Employment Status Recode' (ESR))"
                else:
                    # type(var_note) == list
                    (var_name, var_len, *var_note) = line.strip().split(maxsplit=2)
                    ddict['record_types'][record_type][var_name] = collections.OrderedDict()
                    ddict['record_types'][record_type][var_name]['length'] = var_len
                    # Append a new note if exists.
                    if len(var_note) > 0:
                        if 'notes' not in ddict['record_types'][record_type][var_name]:
                            ddict['record_types'][record_type][var_name]['notes'] = list()
                        ddict['record_types'][record_type][var_name]['notes'].append(var_note[0])
                    catch_var_name = False
                    catch_var_desc = True
                    var_desc_indent = None
            # Variable description is 1+ lines with 1+ space indent.
            # Variable description is followed by variable code(s).
            # Variable code(s) is 1+ line with larger whitespace indent
            # than variable description. Example:"""
            # PUMA00 5
            #     Public use microdata area code (PUMA) based on Census 2000 definition for data
            #     collected prior to 2012. Use in combination with PUMA10.
            #         00100..08200 .Public use microdata area codes
            #         77777 .Combination of 01801, 01802, and 01905 in Louisiana
            #         -0009 .Code classification is Not Applicable because data
            #               .collected in 2012 or later
            # """
            # The last variable code is followed by a newline.
            elif (catch_var_desc or catch_var_code) and line.startswith(' '):
                indent = len(line) - len(line.lstrip())
                # For line 1 of variable description.
                if catch_var_desc and var_desc_indent is None:
                    var_desc_indent = indent
                    var_desc = line.strip()
                    ddict['record_types'][record_type][var_name]['description'] = var_desc
                # For lines 2+ of variable description.
                elif catch_var_desc and indent <= var_desc_indent:
                    var_desc = line.strip()
                    ddict['record_types'][record_type][var_name]['description'] += ' '+var_desc
                # For lines 1+ of variable codes.
                else:
                    catch_var_desc = False
                    catch_var_code = True
                    is_valid_code = None
                    if not line.strip().startswith('.'):
                        # Example case: "01 .One person record (one person in household or"
                        if ' .' in line:
                            (var_code, var_code_desc) = line.strip().split(
                                sep=' .', maxsplit=1)
                            is_valid_code = True
                        # Example inconsistent format case:"""
                        # bbbb. N/A (age less than 15 years; never married)
                        # """
                        elif '. ' in line:
                            (var_code, var_code_desc) = line.strip().split(
                                sep='. ', maxsplit=1)
                            is_valid_code = True
                        else:
                            raise AssertionError(
                                "Program error. Line unaccounted for:\n" +
                                "{line}".format(line=line))
                        if is_valid_code:
                            if 'var_codes' not in ddict['record_types'][record_type][var_name]:
                                ddict['record_types'][record_type][var_name]['var_codes'] = collections.OrderedDict()
                            ddict['record_types'][record_type][var_name]['var_codes'][var_code] = var_code_desc
                    # Example case: ".any person in group quarters)"
                    else:
                        var_code_desc = line.strip().lstrip('.')
                        ddict['record_types'][record_type][var_name]['var_codes'][var_code] += ' '+var_code_desc
            # Example inconsistent format case:"""
            # ADJHSG 7
            # Adjustment factor for housing dollar amounts (6 implied decimal places)
            # """
            elif (catch_var_desc and
                    'description' not in ddict['record_types'][record_type][var_name]):
                var_desc = line.strip()
                ddict['record_types'][record_type][var_name]['description'] = var_desc
                catch_var_desc = False
                catch_var_code = True
            # Example inconsistent format case:"""
            # WGTP10 5
            #     Housing Weight replicate 10
            #         -9999..09999 .Integer weight of housing unit
            # WGTP11 5
            #     Housing Weight replicate 11
            #         -9999..09999 .Integer weight of housing unit
            # """
            elif ((var_name == 'WGTP10' and 'WGTP11' in line)
                    or (var_name == 'YOEP12' and 'ANC' in line)):
                # type(var_note) == list
                (var_name, var_len, *var_note) = line.strip().split(maxsplit=2)
                ddict['record_types'][record_type][var_name] = collections.OrderedDict()
                ddict['record_types'][record_type][var_name]['length'] = var_len
                if len(var_note) > 0:
                    if 'notes' not in ddict['record_types'][record_type][var_name]:
                        ddict['record_types'][record_type][var_name]['notes'] = list()
                    ddict['record_types'][record_type][var_name]['notes'].append(var_note[0])
                catch_var_name = False
                catch_var_desc = True
                var_desc_indent = None
            else:
                if (catch_var_name, catch_var_desc,
                        catch_var_code, catch_var_note) != (False, )*4:
                    raise AssertionError(
                        "Program error. All flags to catch lines should be set " +
                        "to `False` by end-of-file.")
                if var_name != var_name_last:
                    raise AssertionError(
                        "Program error. End-of-file notes should only be read "+
                        "after `var_name_last` has been processed.")
                if 'notes' not in ddict:
                    ddict['notes'] = list()
                ddict['notes'].append(line)
    return ddict
Create the hierarchical dataframe (formatted below as Jupyter Notebook cells):
In [ ]:
import pandas as pd
ddict = parse_pumsdatadict(path=r'/path/to/PUMSDataDict2013.txt')
tmp = dict()
for record_type in ddict['record_types']:
tmp[record_type] = pd.DataFrame.from_dict(ddict['record_types'][record_type], orient='index')
df_ddict = pd.concat(tmp, names=['record_type', 'var_name'])
df_ddict.head()
Out[ ]:
# Click "Run code snippet" below to render the output from `df_ddict.head()`.
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th></th>
<th>length</th>
<th>description</th>
<th>var_codes</th>
<th>notes</th>
</tr>
<tr>
<th>record_type</th>
<th>var_name</th>
<th></th>
<th></th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<th rowspan="5" valign="top">HOUSING RECORD</th>
<th>ACCESS</th>
<td>1</td>
<td>Access to the Internet</td>
<td>{'b': 'N/A (GQ)', '1': 'Yes, with subscription...</td>
<td>NaN</td>
</tr>
<tr>
<th>ACR</th>
<td>1</td>
<td>Lot size</td>
<td>{'b': 'N/A (GQ/not a one-family house or mobil...</td>
<td>NaN</td>
</tr>
<tr>
<th>ADJHSG</th>
<td>7</td>
<td>Adjustment factor for housing dollar amounts (...</td>
<td>{'1000000': '2013 factor (1.000000)'}</td>
<td>[Note: The value of ADJHSG inflation-adjusts r...</td>
</tr>
<tr>
<th>ADJINC</th>
<td>7</td>
<td>Adjustment factor for income and earnings doll...</td>
<td>{'1007549': '2013 factor (1.007549)'}</td>
<td>[Note: The value of ADJINC inflation-adjusts r...</td>
</tr>
<tr>
<th>AGS</th>
<td>1</td>
<td>Sales of Agriculture Products (Yearly sales)</td>
<td>{'b': 'N/A (GQ/vacant/not a one family house o...</td>
<td>[Note: no adjustment factor is applied to AGS.]</td>
</tr>
</tbody>
</table>
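Individual variables can then be looked up through the MultiIndex; for example (using the ST variable from the question's first table):
# The value codes for one variable, as an OrderedDict of code -> label.
st_codes = df_ddict.loc[('HOUSING RECORD', 'ST'), 'var_codes']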
