Find intersection of values in dictionary under a same key - python - python

I have a file whose lines are split on "|". I want to compare argument 5 of each line and proceed only if the values intersect. This leads to a second part:
first arguments1,2 are compared by dictionary, and if they are same AND
if arguments5,6 are overlapping,
then those lines get concatenated.
How can I compute the intersection of values under the same key? The code below works across keys but not within the same key:
from functools import reduce
# Turn the value stored under every key of query_dict into a set and intersect
# them all: the result holds only the items present in the values of ALL keys
# (a cross-key intersection).
reduce(set.intersection, (set(val) for val in query_dict.values()))
Here is an example of lines:
text1|text2|text3|text4 text 5| text 6| text 7 text 8|
text1|text2|text12|text4 text 5| text 6| text 7|
text9|text10|text3|text4 text 5| text 11| text 12 text 8|
The output should be:
text1|text2|text3;text12|text4;text5;text4;text5|text6;text7 text8;text6
In other words, only those lines that are matching by 1st,2nd arguments (cells equal) and if 5th,6th arguments are overlapping (intersection) are concatenated.
Here is input file:
Angela Darvill|19036321|School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.|['GB','US']|['Salford', 'Eccles', 'Manchester']
Helen Stanley|19036320|Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']|['Brighton', 'Brighton']
Angela Darvill|190323121|School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.|['US']|['Brighton', 'Eccles', 'Manchester']
Helen Stanley|19576876320|Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']|['Brighton', 'Brighton']
The output should look like:
Angela Darvill|19036321;190323121|...
Helen Stanley|19036320;19576876320|...
Angela Darvill gets stacked because two records share same name, same country and same city(-ies).

Based on your improved question :
import ast
import itertools

data = """\
Angela Darvill|19036321|School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.|['GB','US']|['Salford', 'Eccles', 'Manchester']
Helen Stanley|19036320|Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']|['Brighton', 'Brighton']
Angela Darvill|190323121|School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK.|['US']|['Brighton', 'Eccles', 'Manchester']
Helen Stanley|19576876320|Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US']|['Brighton', 'Brighton']
"""


def _cell_values(cell):
    """Parse a cell that holds a Python list literal and return its items as a set."""
    return set(ast.literal_eval(cell))


# One tuple of cells per input line (tuples are hashable, which the display code below needs).
lines = tuple(tuple(line.split('|')) for line in data.splitlines())

results = []
for line_a_index, line_a in enumerate(lines):
    # we want to compare each line with each other, so we start at index+1
    for line_b_index, line_b in enumerate(lines[line_a_index+1:], start=line_a_index+1):
        assert len(line_a) >= 5, f"not enough cells ({len(line_a)}) in line {line_a_index}"
        assert len(line_b) >= 5, f"not enough cells ({len(line_b)}) in line {line_b_index}"
        assert all(isinstance(cell, str) for cell in line_a)
        assert all(isinstance(cell, str) for cell in line_b)
        columns0_are_equal = line_a[0] == line_b[0]
        columns1_are_equal = line_a[1] == line_b[1]
        # Columns 3 and 4 hold list literals such as "['GB','US']". Compare their
        # parsed ITEMS: calling set() on the raw string would compare characters.
        # Two cells overlap when they share at least one item.
        columns3_are_overlap = bool(_cell_values(line_a[3]) & _cell_values(line_b[3]))
        columns4_are_overlap = bool(_cell_values(line_a[4]) & _cell_values(line_b[4]))
        print(f"between lines index={line_a_index} and index={line_b_index}, {columns0_are_equal=} {columns1_are_equal=} {columns3_are_overlap=} {columns4_are_overlap=}")
        if (
            columns0_are_equal
            # and columns1_are_equal
            and (columns3_are_overlap or columns4_are_overlap)
        ):
            print("MATCH!")
            # A result is (index_a, index_b, merged cells...): equal cells are kept
            # once, differing cells are joined with ";".
            results.append(
                (line_a_index, line_b_index,) + tuple(
                    ((cell_a or "") + (";" if (cell_a or cell_b) else "") + (cell_b or "")) if cell_a != cell_b
                    else cell_a
                    for cell_a, cell_b in itertools.zip_longest(line_a, line_b)
                )
            )

print("Fancy output :")
# Every row that will be printed: both source lines and the merged line of each match.
lines_to_display = set(itertools.chain.from_iterable((lines[result[0]], lines[result[1]], result[2:]) for result in results))
# First width is for the line-index column; default=0 keeps this working when nothing matched.
columns_widths = (max((len(str(index)) for result in results for index in (result[0], result[1])), default=0),) + tuple(
    max(len(cell) for cell in column)
    for column in zip(*lines_to_display)
)
for width in columns_widths:
    print("-" * width, end="|")
print("")
for result in results:
    for line_index, original_line in zip((result[0], result[1]), (lines[result[0]], lines[result[1]])):
        for column_index, cell in enumerate((str(line_index),) + original_line):
            if cell:
                print(cell.ljust(columns_widths[column_index]), end='|')
        print()  # explicit newline
    for column_index, cell in enumerate(("=",) + result[2:]):
        if cell:
            print(cell.ljust(columns_widths[column_index]), end='|')
    print()  # explicit newline
for width in columns_widths:
    print("-" * width, end="|")
print("")

# Self-check against the output requested in the question.
expected_outputs = """\
Angela Darvill|19036321;190323121|...
Helen Stanley|19036320;19576876320|...
""".splitlines()
for result, expected_output in itertools.zip_longest(results, expected_outputs):
    actual_output = "|".join(result[2:])
    assert actual_output.startswith(expected_output[:-3])  # minus the "..."
-|--------------|--------------------|---------------------------------------------------------------------------------------------------------------------------------------|------------------|------------------------------------------------------------------------|
0|Angela Darvill|19036321 |School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK. |['GB','US'] |['Salford', 'Eccles', 'Manchester'] |
2|Angela Darvill|190323121 |School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK. |['US'] |['Brighton', 'Eccles', 'Manchester'] |
=|Angela Darvill|19036321;190323121 |School of Nursing, University of Salford, Peel House Eccles, Manchester M30 0NN, UK. |['GB','US'];['US']|['Salford', 'Eccles', 'Manchester'];['Brighton', 'Eccles', 'Manchester']|
1|Helen Stanley |19036320 |Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US'] |['Brighton', 'Brighton'] |
3|Helen Stanley |19576876320 |Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US'] |['Brighton', 'Brighton'] |
=|Helen Stanley |19036320;19576876320|Senior Lecturer, Institute of Nursing and Midwifery, University of Brighton, Westlain House, Village Way, Falmer, BN1 9PH Brighton, UK.|['US'] |['Brighton', 'Brighton'] |
-|--------------|--------------------|---------------------------------------------------------------------------------------------------------------------------------------|------------------|------------------------------------------------------------------------|
You can see that the lines index 0 and 2 have been merged, same for the lines index 1 and 3.

from itertools import zip_longest

data = """\
text1|text2|text3|text4 text 5| text 6| text 7 text 8|
text1|text2|text12|text4 text 5| text 6| text 7| text9|text10|text3|text4 text 5| text 11| text 12 text 8|
"""

lines = tuple(line.split('|') for line in data.splitlines())
number_of_lines = len(lines)
print(f"number of lines : {number_of_lines}")
print(f"number of cells in line 1 : {len(lines[0])}")
print(f"number of cells in line 2 : {len(lines[1])}")
print(f"{lines[0]=}")
print(f"{lines[1]=}")

result = []
# we want to compare each line with each other :
for line_a_index, line_a in enumerate(lines):
    # start= keeps line_b_index an index into `lines` (used in the assertion message)
    for line_b_index, line_b in enumerate(lines[line_a_index+1:], start=line_a_index+1):
        # cells 5 and 6 are read below, so at least 7 cells are required
        assert len(line_a) >= 7, f"not enough cells ({len(line_a)}) in line {line_a_index}"
        assert len(line_b) >= 7, f"not enough cells ({len(line_b)}) in line {line_b_index}"
        assert all(isinstance(cell, str) for cell in line_a)
        assert all(isinstance(cell, str) for cell in line_b)
        # NOTE: `in` is a substring test here, and an empty cell is a substring of
        # every string, so an empty cell 5 or 6 always satisfies the condition.
        if line_a[0] == line_b[0] and line_a[1] == line_b[1] and (
            line_a[5] in line_b[5] or line_a[6] in line_b[6]  # A in B
            or line_b[5] in line_a[5] or line_b[6] in line_a[6]  # B in A
        ):
            # equal cells are kept once, differing cells are joined with ";"
            result.append(tuple(
                ((cell_a or "") + (";" if (cell_a or cell_b) else "") + (cell_b or "")) if cell_a != cell_b else cell_a
                for cell_a, cell_b in zip_longest(line_a[:5+1], line_b[:5+1])  # <-- here I truncated the lines
            ))

# I decided to have a fancy output, but I made some simplifying assumptions to make it simple
if len(result) != 1:
    # the display below reads result[0] and only handles a single match
    raise NotImplementedError
# zip_longest pads the shorter rows with None, which counts as width 0
widths = tuple(max(len(cell or "") for cell in cells)
               for cells in zip_longest(lines[0], lines[1], result[0]))
length = max(len(lines[0]), len(lines[1]), len(result[0]))
for line in (lines[0], lines[1], result[0]):
    for index, cell in zip_longest(range(length), line):
        if cell:
            print(cell.ljust(widths[index]), end='|')
    print()  # explicit newline
original_expected_output = "text1|text2|text3;text12|text4;text5;text4;text5|text6;text7 text8;text6"
print(f"{original_expected_output} <-- expected")
lenormju_expected_output = "text1|text2|text3;text12|text4 text 5| text 6| text 7 text 8; text 7"
print(f"{lenormju_expected_output} <-- fixed")
output
number of lines : 2
number of cells in line 1 : 7
number of cells in line 2 : 13
lines[0]=['text1', 'text2', 'text3', 'text4 text 5', ' text 6', ' text 7 text 8', '']
lines[1]=['text1', 'text2', 'text12', 'text4 text 5', ' text 6', ' text 7', ' text9', 'text10', 'text3', 'text4 text 5', ' text 11', ' text 12 text 8', '']
text1|text2|text3 |text4 text 5| text 6| text 7 text 8 |
text1|text2|text12 |text4 text 5| text 6| text 7 | text9|text10|text3|text4 text 5| text 11| text 12 text 8|
text1|text2|text3;text12|text4 text 5| text 6| text 7 text 8; text 7|
text1|text2|text3;text12|text4;text5;text4;text5|text6;text7 text8;text6 <-- expected
text1|text2|text3;text12|text4 text 5| text 6| text 7 text 8; text 7 <-- fixed
EDIT:
from dataclasses import dataclass
from itertools import zip_longest

data = """\
text1|text2|text3|text4 text 5| text 6| text 7 text 8|
text1|text2|text12|text4 text 5| text 6| text 7| text9|text10|text3|text4 text 5| text 11| text 12 text 8|
"""


@dataclass
class Match:  # a convenient way to store the solutions
    # index in `lines` of the first line of the pair
    line_a_index: int
    # index in `lines` of the second line of the pair
    line_b_index: int
    # merged cells of both lines
    line_result: tuple


lines = tuple(line.split('|') for line in data.splitlines())

results = []
for line_a_index, line_a in enumerate(lines):
    for line_b_index, line_b in enumerate(lines[line_a_index+1:], line_a_index+1):
        # cells 5 and 6 are read below, so at least 7 cells are required
        assert len(line_a) >= 7, f"not enough cells ({len(line_a)}) in line {line_a_index}"
        assert len(line_b) >= 7, f"not enough cells ({len(line_b)}) in line {line_b_index}"
        assert all(isinstance(cell, str) for cell in line_a)
        assert all(isinstance(cell, str) for cell in line_b)
        # NOTE: `in` is a substring test here, and an empty cell is a substring of
        # every string, so an empty cell 5 or 6 always satisfies the condition.
        if line_a[0] == line_b[0] and line_a[1] == line_b[1] and (
            line_a[5] in line_b[5] or line_a[6] in line_b[6]  # A in B
            or line_b[5] in line_a[5] or line_b[6] in line_a[6]  # B in A
        ):
            # equal cells are kept once, differing cells are joined with ";"
            line_result = tuple(
                ((cell_a or "") + (";" if (cell_a or cell_b) else "") + (cell_b or "")) if cell_a != cell_b else cell_a
                for cell_a, cell_b in zip_longest(line_a[:5+1], line_b[:5+1])  # <-- here I truncated the lines
            )
            results.append(Match(line_a_index=line_a_index, line_b_index=line_b_index, line_result=line_result))

# simple output of the solution
for result in results:
    print(f"line n°{result.line_a_index} matches with n°{result.line_b_index} : {result.line_result}")
line n°0 matches with n°1 : ('text1', 'text2', 'text3;text12', 'text4 text 5', ' text 6', ' text 7 text 8; text 7')

Related

Only get street and country from Libpostal (Pypostal) - PySpark

I'm using libpostal - pypostal to parse an address but I only need the road and the country in an Array ["franklin ave","usa"],["leonard st","united kingdom"]
How can I achieve this ?
Return type is net.razorvine.pickle.objects.classdictconstructor
from pyspark.sql.functions import udf
LIBPOSTAL_LOADED = False
@udf("string")
def parse(address):
    """Parse ``address`` with libpostal and return the parsed result as a string.

    :param address: Address text to parse
    :return: ``str()`` of whatever ``parse_address`` returns
    """
    # imported inside the function, as in the original snippet
    # (presumably so the import happens where the UDF runs; confirm)
    from postal.parser import parse_address
    address_parsed = parse_address(address)
    return str(address_parsed)
spark.createDataFrame(['781 Franklin Ave Crown Heights Brooklyn NYC NY 11216 USA','The Book Club 100-106 Leonard St, Shoreditch, London, Greater London, EC2A 4RH, United Kingdom'], "string").toDF("address").select(parse("address")).show(truncate=False)
@MCK Updated on request
@udf("array<string>")
def parse(address):
    """Return the road and country components of ``address`` as a list of strings.

    :param address: Address text to parse
    :return: First item of every parsed component whose second item is
        ``'road'`` or ``'country'``, in the order ``parse_address`` yields them
    """
    from postal.parser import parse_address
    # each parsed component is indexed as a[0] (kept) and a[1] (filtered on)
    address_parsed = [a[0] for a in parse_address(address) if a[1] in ['road', 'country']]
    return address_parsed
+------------------+
|[franklin ave,usa]|
+------------------+
This is as expected
############################################################################
@udf("array<string>")
def parse(address):
    """Return only the first road/country component of ``address``.

    :param address: Address text to parse
    :return: First element of the filtered component list
    """
    from postal.parser import parse_address
    address_parsed = [a[0] for a in parse_address(address) if a[1] in ['road', 'country']]
    # NOTE(review): this returns a single element while the UDF is declared as
    # array<string>; the surrounding text reports that the column comes back null.
    return address_parsed[0]
+-----+
|null |
+-----+
This is not as expected. I expect the first element of address_parsed, which is franklin ave.
Maybe you can try a list comprehension before returning the parsed address:
@udf("array<string>")
def parse(address):
    """Return the road and country components of ``address`` as a list of strings.

    :param address: Address text to parse
    :return: List built from the parsed components labelled ``'road'`` or ``'country'``
    """
    from postal.parser import parse_address
    # keep a[0] of every component whose a[1] is one of the wanted labels
    address_parsed = [a[0] for a in parse_address(address) if a[1] in ['road', 'country']]
    return address_parsed

Different Behavior for Parsing Two Similar Wikipedia Infoboxes

I have two infoboxes that look exactly the same to me, but I'm getting different behavior in mwparserfromhell. In the first instance I'm getting what I expect - the entire infobox is captured as a template object. In the second instance parts of the infobox are extracted as separate templates. This is confusing since the infoboxes look very similar to me, and I was expecting that the entire infobox could be extracted in the second case.
This is the code I'm using:
mwparserfromhell.parse(text.strip().lower()).filter_templates()
Text 1 Input:
txt1 = """{{Infobox building
| name = 666 Fifth Avenue
| former_names = Tishman Building
| status = Complete
| image = 666 Fifth Avenue by David Shankbone.jpg
| image_size = 300px
| caption =
| location = 666 Fifth Avenue<br>[[Manhattan]], [[New York (state)|New York]] 10103
| coordinates = {{coord|40.760163|-73.976204|format=dms}}
| start_date =
| completion_date = 1957
| architect = [[Carson & Lundin]]
| owner = [[Brookfield Properties]]
| cost = $40 million
| floor_area = {{convert|1,463,892|sqft|m2|abbr=on}}
| top_floor =
| floor_count = 41
| references =
| map_type =
| building_type = Office
| antenna_spire =
| roof = {{convert|483|ft|m|abbr=on}}
| elevator_count = 24 (20 passenger, 4 freight)
| structural_engineer =
| main_contractor =
| opening = November 25, 1957
| developer = Tishman Realty and Construction
| management =
}}"""
Text 1 Output:
['{{infobox building\n| name = 666 fifth avenue\n| former_names = tishman building\n| status = complete\n| image = 666 fifth avenue by david shankbone.jpg\n| image_size = 300px\n| caption = \n| location = 666 fifth avenue<br>[[manhattan]], [[new york (state)|new york]] 10103\n| coordinates = {{coord|40.760163|-73.976204|format=dms}}\n| start_date = \n| completion_date = 1957\n| architect = [[carson & lundin]]\n| owner = [[brookfield properties]]\n| cost = $40 million\n| floor_area = {{convert|1,463,892|sqft|m2|abbr=on}}\n| top_floor = \n| floor_count = 41\n| references = \n| map_type = \n| building_type = office\n| antenna_spire = \n| roof = {{convert|483|ft|m|abbr=on}}\n| elevator_count = 24 (20 passenger, 4 freight)\n| structural_engineer = \n| main_contractor = \n| opening = november 25, 1957\n| developer = tishman realty and construction\n| management = \n}}',
'{{coord|40.760163|-73.976204|format=dms}}',
'{{convert|1,463,892|sqft|m2|abbr=on}}',
'{{convert|483|ft|m|abbr=on}}']
Text 2 Input:
txt2 = """{{Infobox building
| name = Central Park Tower
| alternate_names = Nordstrom Tower
| image = Central Park Tower April 2020.jpg
| caption = Central Park Tower on April 25, 2020
| location = 225 [[57th Street (Manhattan)|West 57th Street]]<br/>[[Manhattan]], [[New York City]], [[New York (state)|New York]], [[United States|U.S.]]
| coordinates = {{coord|40.7663|-73.9810|type:landmark_globe:earth_region:US-NY|display=inline,title}}
| status = Topped Out
| start_date = 2014
| est_completion = 2020<ref name=curbed>{{cite news |author=Amy Plitt |url=https://ny.curbed.com/2017/6/1/15714666/central-park-tower-offering-plan-approval-sales-launch |title=Central Park Tower is now one step closer to launching sales |date=June 1, 2017 |access-date=August 30, 2017 |work=Curbed}}</ref>
| building_type = [[Residential]], [[retail]]
| architectural_style = [[Modern architecture|Modern]]
| architectural = {{cvt|1550|ft|0}}
| floor_count = 131<ref>{{cite web |url=https://www.architecturaldigest.com/story/new-york-city-central-park-tower-worlds-tallest-residential-building </ref><ref>{{cite web |url=https://archpaper.com/2019/09/central-park-tower-tops-out/</ref> (98 habitable floors)<ref name="auto">{{Cite web |url=http://www.skyscrapercenter.com/building/central-park-tower/14269 |title=Central Park Tower - The Skyscraper Center |website=www.skyscrapercenter.com |access-date=October 10, 2018}}</ref>
| elevator_count = 11
| cost = $3 billion<ref name="Tase">{{cite news|url=https://commercialobserver.com/2019/04/all-in-good-tase-the-crisis-for-the-american-cohort-in-tel-aviv-is-essentially-over/|title=All in Good TASE: The Crisis for the American Cohort in Tel Aviv Is Essentially Over|date=April 4, 2019|work=Commercial Observer|last=Gourarie|first=Chava}}</ref>
| floor_area = {{convert|1,285,308|sqft|m2}}<ref name="auto" />
| architect = [[Adrian Smith + Gordon Gill Architecture]]
| structural_engineer = [[WSP Global]]
| main_contractor = [[Lendlease]]
| developer = [[Extell Development Company]]
}}"""
Text 2 Output:
['{{coord|40.7663|-73.9810|type:landmark_globe:earth_region:us-ny|display=inline,title}}',
'{{cite news |author=amy plitt |url=https://ny.curbed.com/2017/6/1/15714666/central-park-tower-offering-plan-approval-sales-launch |title=central park tower is now one step closer to launching sales |date=june 1, 2017 |access-date=august 30, 2017 |work=curbed}}',
'{{cvt|1550|ft|0}}',
'{{cite web |url=https://archpaper.com/2019/09/central-park-tower-tops-out/</ref> (98 habitable floors)<ref name="auto">{{cite web |url=http://www.skyscrapercenter.com/building/central-park-tower/14269 |title=central park tower - the skyscraper center |website=www.skyscrapercenter.com |access-date=october 10, 2018}}</ref>\n| elevator_count = 11\n| cost = $3 billion<ref name="tase">{{cite news|url=https://commercialobserver.com/2019/04/all-in-good-tase-the-crisis-for-the-american-cohort-in-tel-aviv-is-essentially-over/|title=all in good tase: the crisis for the american cohort in tel aviv is essentially over|date=april 4, 2019|work=commercial observer|last=gourarie|first=chava}}</ref>\n| floor_area = {{convert|1,285,308|sqft|m2}}<ref name="auto" />\n| architect = [[adrian smith + gordon gill architecture]]\n| structural_engineer = [[wsp global]]\n| main_contractor = [[lendlease]]\n| developer = [[extell development company]]\n}}',
'{{cite web |url=http://www.skyscrapercenter.com/building/central-park-tower/14269 |title=central park tower - the skyscraper center |website=www.skyscrapercenter.com |access-date=october 10, 2018}}',
'{{cite news|url=https://commercialobserver.com/2019/04/all-in-good-tase-the-crisis-for-the-american-cohort-in-tel-aviv-is-essentially-over/|title=all in good tase: the crisis for the american cohort in tel aviv is essentially over|date=april 4, 2019|work=commercial observer|last=gourarie|first=chava}}',
'{{convert|1,285,308|sqft|m2}}']
This is a known bug for mwparserfromhell. My workaround was to create an on-the-fly regex pattern that would remove the ref link but keep the rest of the text intact.
{{ this is text <ref>this is a ref link}}</ref>
to
{{ this is text }}
Here's the code:
def get_regex_str_pattern(reg_str):
    """ Creates regex string to remove specific patterns that are embedded in larger strings

    The input is split on whitespace; every token is regex-escaped and wrapped in
    its own capture group, and the groups are joined with ``.*``.

    :param reg_str: String to tokenize (converted with ``str()`` first, so
        non-str objects with a text representation are accepted too)
    :return: Regex pattern
    """
    # split() without arguments already drops surrounding whitespace from each token
    return r'.*'.join(f"({re.escape(token)})" for token in str(reg_str).split())
def get_ref_clean_str(txt):
    """ Removes badly formed ref strings from wiki text

    Every ``ref`` or ``br`` tag found in ``txt`` is turned into a pattern by
    ``get_regex_str_pattern`` and each match in the text is replaced by a single
    space; the cleaned text is then parsed again.

    :param txt: Wiki text
    :return: Parser object
    """
    clean_txt = txt
    wiki_text = mwparserfromhell.parse(txt)
    for r in wiki_text.filter_tags():
        if str(r.tag) in ('ref', 'br'):
            # replace the tag's text with a space in the running copy of the text
            clean_txt = re.sub(get_regex_str_pattern(r), ' ', clean_txt)
    return mwparserfromhell.parse(clean_txt)

Return first word of the string

I have data like below:
Colindale London
London Borough of Bromley
Crystal Palace, London
Bermondsey, London
Camden, London
This is my code:
def clean_whitespace(s):
    """Return ``s`` converted to a string, with every space removed, in lowercase."""
    out = str(s).replace(' ', '')
    return out.lower()
My code currently just returns the string with the whitespace removed. How can I select the first word of the string? For example:
Crystal Palace, London -> crystal-palace
Bermondsey, London -> bermondsey
Camden, London -> camden
You can try this code:
s = 'Bermondsey, London'
def clean_whitespace(s):
    """Return the part of ``s`` before the first comma as a lowercase, dash-separated slug.

    :param s: Value to clean (converted with ``str()`` first)
    :return: Text before the first comma, stripped, spaces replaced by ``-``, lowercased
    """
    # partition() keeps everything when there is no comma at all
    first_part = str(s).partition(',')[0].strip()
    return first_part.replace(' ', '-').lower()
print(clean_whitespace(s))
Output:
bermondsey
Try this below :
s = "Crystal Palace, London"
# keep the text before the first comma, lowercase it and join its words with dashes
output = s.partition(',')[0].lower().replace(' ', '-')
print(output)

Text to PDF Positioning Lines

I have a text file that I am reading and writing line by line into a PDF. The lines are out of position in the PDF because the FPDF library left-aligns all my lines, so I am using set_x to position each line to my liking. I am trying to reposition the header lines up to "RATE CODE CY" and would like all the data under the columns to follow them. Then another header appears, and I would like to align every header that comes after the data in the same way. I know a for loop is needed to write the rest of the data; the issue is that a header will come again, and that is where I have to make the change with set_x.
# Write the two-part header lines of the report: the left part at the current
# position and the right part starting at x=125.
pdf = FPDF("L", "mm", "A4")
pdf.add_page()
pdf.set_font('arial', style='', size=10.0)
lines = file.readlines()
header8 = lines[7]
# split each header line into a left part (first words) and a right part (the rest)
header8_1 = " ".join(lines[8].split()[:4])
header8_2 = " ".join(lines[8].split()[4:])
header9_1 = " ".join(lines[9].split()[:5])
header9_2 = " ".join(lines[9].split()[5:])
pdf.cell(ln=0, h=5.0, align='L', w=0, txt=header8_1, border=0)
pdf.set_x(125)
pdf.cell(ln=1, h=5.0, align='L', w=0, txt=header8_2, border=0)
pdf.cell(ln=0, h=5.0, align='L', w=0, txt=header9_1, border=0)
pdf.set_x(125)
# was `pdfcell(...)`, an undefined name; the call belongs to the pdf object
pdf.cell(ln=1, h=5.0, align='L', w=0, txt=header9_2, border=0)
Current PDF file:
READ SVC B MAXIMUM TOTAL DUE METER NO REMARKS
ACCOUNT # SERVICE ADDRESS CITY DATE DAY C KWH KWD AMOUNT
RATE CODE CY CUSTOMER NAME MAILING ADDRESS
----------------------------------------------------------------------------------------------------
11211-22222 12345 TEST HWY #86 TITUSVIL 10/12/19 29 C 1,444 189.01 ABC1234
GS-1 3 Home & ASSOC INC 1234 Miami HWY APT49
22222-33333 12345 TEST HWY #88 TITUSVIL 10/04/19 29 C 256 41.50 ABC1235
GS-1 3 DGN & ASSOC INC 1234 Miami HWY APT49
READ SVC B MAXIMUM TOTAL DUE METER NO REMARKS
ACCOUNT # SERVICE ADDRESS CITY DATE DAY C KWH KWD AMOUNT
RATE CODE CY CUSTOMER NAME MAILING ADDRESS
----------------------------------------------------------------------------------------------------
11211-22222 12345 TEST HWY #86 TITUSVIL 10/12/19 29 C 1,444 189.01 ABC1234
GS-1 3 Home & ASSOC INC 1234 Miami HWY APT49
22222-33333 12345 TEST HWY #88 TITUSVIL 10/04/19 29 C 256 41.50 ABC1235
GS-1 3 DGN & ASSOC INC 1234 Miami HWY APT49

Search and Find through a list Python

I have a main text file that looks like this:
STATUS| CRN| SUBJECT| SECT| COURSE| CREDIT| INSTR.| BLDG/RM| DAY/TIME| FROM / TO|
OPEN| 43565| ACA6202| 10| Acting II| 3.00| Logan, G| SEE DEPT| | 01/12/15 - 04/27/15|
OPEN| 43566| ACA6206| 10| Topics:Classical Drama/Cult II| 2.00| Jacobson, L| SEE DEPT| | 01/12/15 - 04/27/15|
OPEN| 43567| ACA6210| 10| Text II| 2.00| Logan, G| SEE DEPT| | 01/12/15 - 04/27/15|
OPEN| 43568| ACA6212| 10| Voice and Speech II| 3.00| Logan, G| SEE DEPT| | 01/12/15 - 04/27/15|
OPEN| 43569| ACA6216| 10| Movement II| 2.00| Logan, G| SEE DEPT| | 01/12/15 - 04/27/15|
OPEN| 43570| ACA6220| 10| Alexander Technique II| 2.00| Logan, G| SEE DEPT| | 01/12/15 - 04/27/15|
OPEN| 43571| ACA6224| 10| Stage Combat II| 2.00| Logan, G| SEE DEPT| | 01/12/15 - 04/27/15|
OPEN| 43572| ACA6228| 10| Practicum IV| 3.00| Logan, G| SEE DEPT| | 01/12/15 - 04/27/15|
OPEN| 44500| ACA6595| 10| Selected Topics| 1.00| Logan, G| SEE DEPT| | 01/12/15 - 04/27/15|
My code below gathers only the "SUBJECT" column and strips the numbers from the string. So for example, the output from the top of the file would print several "ACA"s.
# Load both text files as tuples of lines, then derive a subject abbreviation
# for each schedule row by dropping the digits from its third "|"-separated field.
with open ("/Users/it/Desktop/Classbook/classAbrevs.txt", "r") as myfile:
# NOTE(review): the file is opened a second time here; the handle bound to
# `myfile` above is never used.
subsAndAbrevsMap = tuple(open("/Users/it/Desktop/Classbook/classAbrevs.txt", 'r'))
with open ("/Users/it/Desktop/Classbook/masterClassList.txt", "r") as myfile:
masterSchedule = tuple(open("/Users/it/Desktop/Classbook/masterClassList.txt", 'r'))
for masterline in masterSchedule:
# NOTE(review): str.strip() returns a new string; the result is discarded, so
# masterline itself is unchanged.
masterline.strip()
masterSplitLine = masterline.split("|")
# skip the header row, whose first field is "STATUS"
if masterSplitLine[0] != "STATUS":
# keep every non-digit character of the third field
subjectAbrev = ''.join([i for i in masterSplitLine[2] if not i.isdigit()])
I have another .txt file that looks like this:
Academy for Classical Acting,ACA
Accountancy,ACCY
Africana Studies,AFST
American Studies,AMST
Anatomy & Regenerative Biology,ANAT
Anthropology,ANTH
Applied Science,APSC
Arabic,ARAB
Art/Art History,AH
Art/Fine Arts,FA
Astronomy,ASTR
Biochemistry,BIOC
Biological Sciences,BISC
In my code below, I check to see if the abbreviations(column 2) in my second .txt equal the abbreviations generated from my first .txt document. If it is a match I would like to append the full class name:
#open 2nd .txt, strip and split
# Compare the second ","-separated field of every abbreviation line with
# subjectAbrev and print the first field on a match (Python 2 print statements).
for subsline in subsAndAbrevsMap:
# NOTE(review): str.strip() returns a new string; the result is discarded, so
# subsline is unchanged. If the lines come straight from a file, the last field
# presumably still carries its trailing newline when compared below; confirm.
subsline.strip()
subLineSplit = subsline.split(",")
# NOTE(review): this indexes subsline (the unsplit line), not subLineSplit.
print "subLineSplit is: " + subsline[0]
if subLineSplit[1] == subjectAbrev:
realSubjectName = subLineSplit[0]
print "The subject name for abrev " + subjectAbrev + " is " + realSubjectName
I want the output to print:
"The subject name for abrev ACA is Academy for Classical Acting"
What am I doing wrong?
First of all, these are csv files, so use your csv module!
# path to first file is ~/classes.csv
# path to second file is ~/abbr.csv
import csv
import os.path

# open() does not expand "~", so resolve the home directory explicitly
classes_path = os.path.expanduser("~/classes.csv")
abbr_path = os.path.expanduser("~/abbr.csv")

# The 'U' mode flag was removed in Python 3.11; newline='' is what the csv
# module documentation asks for when opening files for a reader.
with open(classes_path, newline='') as classes_csv, \
        open(abbr_path, newline='') as abbr_csv:
    classes = csv.reader(classes_csv, delimiter='|')
    abbr = csv.reader(abbr_csv, delimiter=',')
    header = next(classes, None)  # skip the header row (None if the file is empty)
    # create a lookup dictionary for your tags -> names (rows without two columns are skipped)
    abbr_dict = {line[1].strip(): line[0].strip() for line in abbr if len(line) > 1}
    # create a genexp for all the extant tags in ~/classes.csv:
    # the third column with surrounding digits and spaces stripped
    class_tags = (line[2].strip("0123456789 ") for line in classes if len(line) > 2)
    # the generator reads from the open file, so it must be consumed inside the with block
    result = {tag: abbr_dict[tag] for tag in class_tags if tag in abbr_dict}
Then it should be easy to format your result.
# result maps each abbreviation to its full subject name
for abbr, cls in result.items():
    print("The abbreviation for {} is {}".format(cls, abbr))

Categories

Resources