Reading statistics from a .txt file and outputting them

Reading statistics from a .txt file and outputting them - python

I am supposed to get certain information from a .txt file and output it. This is the information I need:
State with the maximum population
State with the minimum population
Average state population
State of Texas population
The DATA looks like:
Alabama
AL
4802982
Alaska
AK
721523
Arizona
AZ
6412700
Arkansas
AR
2926229
California
CA
37341989
This is my code that does not really do anything I need it to do:
def main():
# Open the StateCensus2010.txt file.
census_file = open('StateCensus2010.txt', 'r')
# Read the state name
state_name = census_file.readline()
while state_name != '':
state_abv = census_file.readline()
population = int(census_file.readline())
state_name = state_name.rstrip('\n')
state_abv = state_abv.rstrip('\n')
print('State Name: ', state_name)
print('State Abv.: ', state_abv)
print('Population: ', population)
print()
state_name = census_file.readline()
census_file.close()
main()
All I have it doing is reading the state name, abv and converting the population into an int. I don't need it to do anything of that, however I'm unsure how to do what the assignment is asking. Any hints would definitely be appreciated! I've been trying some things for the past few hours to no avail.
Update:
This is my updated code however I'm receving the following error:
Traceback (most recent call last):
File "main.py", line 13, in <module>
if population > max_population:
TypeError: unorderable types: str() > int()
Code:
with open('StateCensus2010.txt', 'r') as census_file:
while True:
try:
state_name = census_file.readline()
state_abv = census_file.readline()
population = int(census_file.readline())
except IOError:
break
# data processing here
max_population = 0
for population in census_file:
if population > max_population:
max_population = population
print(max_population)

As the data is in consistent order; Statename, State Abv, Population. So you just need to read the lines one time, and display all three 3 information. Below is the sample code.
average = 0.0
total = 0.0
state_min = 999999999999
state_max = 0
statename_min = ''
statename_max = ''
texas_population = 0
with open('StateCensus2010.txt','r') as file:
# split new line, '\n' here means newline
data = file.read().split('\n')
# get the length of the data by using len() method
# there are 50 states in the text file
# each states have 3 information stored,
# state name, state abreviation, population
# that's why length of data which is 150/3 = 50 states
state_total = len(data)/3
# this count is used as an index for the list
count = 0
for i in range(int(state_total)):
statename = data[count]
state_abv = data[count+1]
population = int(data[count+2])
print('Statename : ',statename)
print('State Abv : ',state_abv)
print('Population: ',population)
print()
# sum all states population
total += population
if population > state_max:
state_max = population
statename_max = statename
if population < state_min:
state_min = population
statename_min = statename
if statename == 'Texas':
texas_population = population
# add 3 because we want to jump to next state
# for example the first three lines is Alabama info
# the next three lines is Alaska info and so on
count += 3
# divide the total population with number of states
average = total/state_total
print(str(average))
print('Lowest population state :', statename_min)
print('Highest population state :', statename_max)
print('Texas population :', texas_population)

This problem is pretty easy using pandas.
Code:
states = []
for line in data:
states.append(
dict(state=line.strip(),
abbrev=next(data).strip(),
pop=int(next(data)),
)
)
df = pd.DataFrame(states)
print(df)
print('\nmax population:\n', df.ix[df['pop'].idxmax()])
print('\nmin population:\n', df.ix[df['pop'].idxmin()])
print('\navg population:\n', df['pop'].mean())
print('\nAZ population:\n', df[df.abbrev == 'AZ'])
Test Data:
from io import StringIO
data = StringIO(u'\n'.join([x.strip() for x in """
Alabama
AL
4802982
Alaska
AK
721523
Arizona
AZ
6412700
Arkansas
AR
2926229
California
CA
37341989
""".split('\n')[1:-1]]))
Results:
abbrev pop state
0 AL 4802982 Alabama
1 AK 721523 Alaska
2 AZ 6412700 Arizona
3 AR 2926229 Arkansas
4 CA 37341989 California
max population:
abbrev CA
pop 37341989
state California
Name: 4, dtype: object
min population:
abbrev AK
pop 721523
state Alaska
Name: 1, dtype: object
avg population:
10441084.6
AZ population:
abbrev pop state
2 AZ 6412700 Arizona

Another pandas solution, from the interpreter:
>>> import pandas as pd
>>>
>>> records = [line.strip() for line in open('./your.txt', 'r')]
>>>
>>> df = pd.DataFrame([records[i:i+3] for i in range(0, len(records), 3)],
... columns=['State', 'Code', 'Pop']).dropna()
>>>
>>> df['Pop'] = df['Pop'].astype(int)
>>>
>>> df
State Code Pop
0 Alabama AL 4802982
1 Alaska AK 721523
2 Arizona AZ 6412700
3 Arkansas AR 2926229
4 California CA 37341989
>>>
>>> df.ix[df['Pop'].idxmax()]
State California
Code CA
Pop 37341989
Name: 4, dtype: object
>>>
>>> df.ix[df['Pop'].idxmin()]
State Alaska
Code AK
Pop 721523
Name: 1, dtype: object
>>>
>>> df['Pop'].mean()
10441084.6
>>>
>>> df.ix[df['Code'] == 'AZ' ]
State Code Pop
2 Arizona AZ 6412700

Please try this the earlier code was not python 3 compatible. It supported python 2.7
def extract_data(state):
total_population = 0
for states, stats in state.items():
population = stats.get('population')
state_name = stats.get('state_name')
states = states
total_population = population + total_population
if 'highest' not in vars():
highest = population
higherst_state_name = state_name
highest_state = states
if 'lowest' not in vars():
lowest = population
lowest_state_name = state_name
lowest_state = states
if highest < population:
highest = population
higherst_state_name = state_name
highest_state = states
if lowest > population:
lowest = population
lowest_state_name = state_name
lowest_state = states
print(highest_state, highest)
print(lowest_state, lowest)
print(len(state))
print(int(total_population/len(state)))
print(state.get('TX').get('population'))
def main():
# Open the StateCensus2010.txt file.
census_file = open('states.txt', 'r')
# Read the state name
state_name = census_file.readline()
state = {}
while state_name != '':
state_abv = census_file.readline()
population = int(census_file.readline())
state_name = state_name.rstrip('\n')
state_abv = state_abv.rstrip('\n')
if state_abv in state:
state[state_abv].update({'population': population, 'state_name': state_name})
else:
state.setdefault(state_abv,{'population': population, 'state_name': state_name})
state_name = census_file.readline()
census_file.close()
return state
state=main()
extract_data(state)

Related

Python Regex Capturing Multiple Matches in separate observations

I am trying to create variables location; contract items; contract code; federal aid using regex on the following text:
PAGE 1
BID OPENING DATE 07/25/18 FROM 0.2 MILES WEST OF ICE HOUSE 07/26/18 CONTRACT NUMBER 03-2F1304 ROAD TO 0.015 MILES WEST OF CONTRACT CODE 'A '
LOCATION 03-ED-50-39.5/48.7 DIVISION HIGHWAY ROAD 44 CONTRACT ITEMS
INSTALL SANDTRAPS AND PULLOUTS FEDERAL AID ACNH-P050-(146)E
PAGE 1
BID OPENING DATE 07/25/18 IN EL DORADO COUNTY AT VARIOUS 07/26/18 CONTRACT NUMBER 03-2H6804 LOCATIONS ALONG ROUTES 49 AND 193 CONTRACT CODE 'C ' LOCATION 03-ED-0999-VAR 13 CONTRACT ITEMS
TREE REMOVAL FEDERAL AID NONE
PAGE 1
BID OPENING DATE 07/25/18 IN LOS ANGELES, INGLEWOOD AND 07/26/18 CONTRACT NUMBER 07-296304 CULVER CITY, FROM I-105 TO PORT CONTRACT CODE 'B '
LOCATION 07-LA-405-R21.5/26.3 ROAD UNDERCROSSING 55 CONTRACT ITEMS
ROADWAY SAFETY IMPROVEMENT FEDERAL AID ACIM-405-3(056)E
This text is from one word file; I'll be looping my code on multiple doc files. In the text above are three location; contract items; contract code; federal aid pairs. But when I use regex to create variables, only the first instance of each pair is included.
The code I have right now is:
# imports
import os
import pandas as pd
import re
import docx2txt
import textract
import antiword
all_bod = []
all_cn = []
all_location = []
all_fedaid = []
all_contractcode = []
all_contractitems = []
all_file = []
text = ' PAGE 1
BID OPENING DATE 07/25/18 FROM 0.2 MILES WEST OF ICE HOUSE 07/26/18 CONTRACT NUMBER 03-2F1304 ROAD TO 0.015 MILES WEST OF CONTRACT CODE 'A '
LOCATION 03-ED-50-39.5/48.7 DIVISION HIGHWAY ROAD 44 CONTRACT ITEMS
INSTALL SANDTRAPS AND PULLOUTS FEDERAL AID ACNH-P050-(146)E
PAGE 1
BID OPENING DATE 07/25/18 IN EL DORADO COUNTY AT VARIOUS 07/26/18 CONTRACT NUMBER 03-2H6804 LOCATIONS ALONG ROUTES 49 AND 193 CONTRACT CODE 'C ' LOCATION 03-ED-0999-VAR 13 CONTRACT ITEMS
TREE REMOVAL FEDERAL AID NONE
PAGE 1
BID OPENING DATE 07/25/18 IN LOS ANGELES, INGLEWOOD AND 07/26/18 CONTRACT NUMBER 07-296304 CULVER CITY, FROM I-105 TO PORT CONTRACT CODE 'B '
LOCATION 07-LA-405-R21.5/26.3 ROAD UNDERCROSSING 55 CONTRACT ITEMS
ROADWAY SAFETY IMPROVEMENT FEDERAL AID ACIM-405-3(056)E'
bod1 = re.search('BID OPENING DATE \s+ (\d+\/\d+\/\d+)', text)
bod2 = re.search('BID OPENING DATE\n\n(\d+\/\d+\/\d+)', text)
if not(bod1 is None):
bod = bod1.group(1)
elif not(bod2 is None):
bod = bod2.group(1)
else:
bod = 'NA'
all_bod.append(bod)
# creating contract number
cn1 = re.search('CONTRACT NUMBER\n+(.*)', text)
cn2 = re.search('CONTRACT NUMBER\s+(.........)', text)
if not(cn1 is None):
cn = cn1.group(1)
elif not(cn2 is None):
cn = cn2.group(1)
else:
cn = 'NA'
all_cn.append(cn)
# location
location1 = re.search('LOCATION \s+\S+', text)
location2 = re.search('LOCATION \n+\S+', text)
if not(location1 is None):
location = location1.group(0)
elif not(location2 is None):
location = location2.group(0)
else:
location = 'NA'
all_location.append(location)
# federal aid
fedaid = re.search('FEDERAL AID\s+\S+', text)
fedaid = fedaid.group(0)
all_fedaid.append(fedaid)
# contract code
contractcode = re.search('CONTRACT CODE\s+\S+', text)
contractcode = contractcode.group(0)
all_contractcode.append(contractcode)
# contract items
contractitems = re.search('\d+ CONTRACT ITEMS', text)
contractitems = contractitems.group(0)
all_contractitems.append(contractitems)
This code parses the only first instance of these variables in the text.
contract-number
location
contract-items
contract-code
federal-aid
03-2F1304
03-ED-50-39.5/48.7
44
A
ACNH-P050-(146)E
But, I am trying to figure out a way to get all possible instances in different observations.
contract-number
location
contract-items
contract-code
federal-aid
03-2F1304
03-ED-50-39.5/48.7
44
A
ACNH-P050-(146)E
03-2H6804
03-ED-0999-VAR
13
C
NONE
07-296304
07-LA-405-R21.5/26.3
55
B
ACIM-405-3(056)E
The all_variables in the code are for looping over multiple word files - we can ignore that if we want :).
Any leads would be super helpful. Thanks so much!

import re
data = []
df = pd.DataFrame()
regex_contract_number =r"(?:CONTRACT NUMBER\s+(?P<contract_number>\S+?)\s)"
regex_location = r"(?:LOCATION\s+(?P<location>\S+))"
regex_contract_items = r"(?:(?P<contract_items>\d+)\sCONTRACT ITEMS)"
regex_federal_aid =r"(?:FEDERAL AID\s+(?P<federal_aid>\S+?)\s)"
regex_contract_code =r"(?:CONTRACT CODE\s+\'(?P<contract_code>\S+?)\s)"
regexes = [regex_contract_number,regex_location,regex_contract_items,regex_federal_aid,regex_contract_code]
for regex in regexes:
for match in re.finditer(regex, text):
data.append(match.groupdict())
df = pd.concat([df, pd.DataFrame(data)], axis=1)
data = []
df

Continued Difficulties with Python Question

I asked this question earlier and am still having difficulties. I tried a new approach that isn't working. Essentially, I'm trying to implement a program that performs a calculation using Python objects to represent data. I want to determine the name of the county that had the highest voter turnout in a previous election, as well as the percentage of the population who voted. I need to use two function names, but can manipulate them however I see fit. Here's what I currently have and not sure what mistake I'm making here:
#creating a dictionary to store the country name and its percentage
data = {}
#creating the class county
class County:
def __init__(self,county,population,voters):
self.country = country
self.voters = voters
self.population = population
self.sorted_data = ""
self.formatted_percentage = ""
def highest_turnout(data) :
highest = data[0]
highest_percent = (data[0].voters / data[0].population)
for data in County
if (County.voters / County.population) > highest_percent
highest = County
highest_percent = County.data
allegheny = County("allegheny", 1000490, 645469)
philadelphia = County("philadelphia", 1134081, 539069)
montgomery = County("montgomery", 568952, 399591)
lancaster = County("lancaster", 345367, 230278)
delaware = County("delaware", 414031, 284538)
chester = County("chester", 319919, 230823)
bucks = County("bucks", 444149, 319816)
I need the “highest_turnout” function to do this:
Find the County that has the highest turnout, i.e. the highest percentage of the
population who voted, using the objects’ population and voters attributes
Return a tuple containing the name of the County with the highest turnout and the
percentage of the population who voted, in that order; the percentage should be
represented as a number between 0 and 1
Display the results of any “print” functions, as well as the last one which prints the return value of the function. Note that your highest_turnout function should correctly determine the County with the highest turnout for any input list
Any explanations / advice on how to approach this would be greatly appreciated. Thank you as I'm pretty new to Python and want to learn as much as possible.

#creating a list to store the country name and its percentage
data = []
#creating the class county
class County:
def __init__(self,county,population,voters):
self.county = county
self.voters = voters
self.population = population
self.sorted_data = ""
self.formatted_percentage = ""
def highest_turnout(data) :
sorted_data_by_turnout = sorted(data, key=lambda county: county.voters / county.population, reverse=True)
highest_turnout_county = sorted_data_by_turnout[0]
return highest_turnout_county.county, (highest_turnout_county.voters / highest_turnout_county.population)
data = []
data.append(County("allegheny", 1000490, 645469))
data.append(County("philadelphia", 1134081, 539069))
data.append(County("montgomery", 568952, 399591))
data.append(County("lancaster", 345367, 230278))
data.append(County("delaware", 414031, 284538))
data.append(County("chester", 319919, 230823))
data.append(County("bucks", 444149, 319816))
print(highest_turnout(data))
FYI: there were some indentation errors in your code

How can i split string?

I'm trying to split string into address, city, state and zip code but unable to split successfully.
Here is my code:
address = "4502 150th Pl SE, Bellevue, WA 98006"
my_add = address.split(',')
street = my_add[0]
city = my_add[1]
state_zip = my_add[2]
state_zip = state_zip
state = state_zip.split(' ')
print(street)
print(city)
print(state_zip)
print(state)
# 4502 150th Pl SE
# Bellevue
# WA 98006
# ['', 'WA', '98006']
I expect that address will be split as:
address : 4502 150th Pl SE
city : Bellevue
state: WA
zipcode: 98006
can anyone help me to find best possible solution. Thanks

If you are sure that a comma is always followed by a space, you can do this:
address = "4502 150th Pl SE, Bellevue, WA 98006"
street, city, state_info = address.split(", ")
state, zipcode = state_info.split(" ")
print("address:", street)
print("city:", city)
print("state:", state)
print("zipcode:", zipcode)

You are getting some extra spaces in there, and since you are splitting on spaces, you end up with my_add[2] containing three elements: an empty string (comes before the first space), your state, and your zip code. You can add .strip() to your code to fix this:
street = my_add[0].strip()
city = my_add[1].strip()
state_zip = my_add[2].strip() # remove extra spaces
state_zip = state_zip.split(' ') # now split on space to get state and zip
state = state_zip[0] # first element: state
zip_code = state_zip[1] # second element: zip
print(street)
print(city)
print(state_zip)
print(state)
print(zip_code)
# 4502 150th Pl SE
# Bellevue
# ['WA', '98006']
# WA
# 98006

I think your solution would be the code below:
address = "4502 150th Pl SE, Bellevue, WA 98006"
my_add = address.split(',')
street = my_add[0]
city = my_add[1]
state_zip = my_add[2]
state_zip_split = state_zip.split(' ')
state_zip = state_zip_split[2]
state = state_zip_split[1]
print("Street: ", street)
print("City: ", city)
print("State Zip: ", state_zip)
print("State: ", state)
You defined state_zip as an array, you needed to split it one more time to get the state and zip code

You can try this.
>>> address = "4502 150th Pl SE, Bellevue, WA 98006"
>>> my_add = address.split(',')
>>> street = my_add[0]
>>> street
'4502 150th Pl SE'
>>> city = my_add[1].strip()
>>> city
'Bellevue'
>>> state_zip = my_add[2].split()[1]
>>> state_zip
'98006'
>>> state = my_add[2].split()[0]
>>> state
'WA'
Hope it helps.

One way to solve this
import re
re.split(', ', address)
*add1, city, state, zipcode = [x for x in re.split('[ ,]', address) if x!='']
add1 = ' '.join(add1)

Iterating over set of lists to find highest average in Python

How to create function which iterates over each county, calculating voter turnout percentage?
class County:
def __init__(self, init_name, init_population, init_voters) :
self.name = init_name
self.population = init_population
self.voters = init_voters
def highest_turnout(data) :
100 * (self.voters / self.population)
allegheny = County("allegheny", 1000490, 645469)
philadelphia = County("philadelphia", 1134081, 539069)
montgomery = County("montgomery", 568952, 399591)
lancaster = County("lancaster", 345367, 230278)
delaware = County("delaware", 414031, 284538)
chester = County("chester", 319919, 230823)
bucks = County("bucks", 444149, 319816)
data = [allegheny, philadelphia, montgomery, lancaster, delaware, chester, bucks]

Your class County is defined correctly.
However, function county is not correct.
When passing data in the function highest_turnout, you have to first calculate the percentage of voters in the first County of the list - it is positioned at data[0].
Then we set “highest” to be the country name of the 1st County, we assume that the 1st in the data list is the highest one we have seen.
Next, we use a for loop to start iterating over all the County objects in the list data in order to pass in each County object.
The variable pct gives us the percentage of voters in the County that is running in the current step. The if function compares it to the highest percentage stored in the variable pct. If the new percentage is higher than pct (returns True), we update the highest percentage variable pct and hence update the county name.
def highest_turnout(data) :
highest_pct = data[0].voters / data[0].population
highest = data[0].name
for county in data :
pct = county.voters / county.population
if pct > highest_pct :
highest_pct = pct
highest = county.name

Data Analysis using Python

I have 2 CSV files. One with city name, population and humidity. In second cities are mapped to states. I want to get state-wise total population and average humidity. Can someone help? Here is the example:
CSV 1:
CityName,population,humidity
Austin,1000,20
Sanjose,2200,10
Sacramento,500,5
CSV 2:
State,city name
Ca,Sanjose
Ca,Sacramento
Texas,Austin
Would like to get output(sum population and average humidity for state):
Ca,2700,7.5
Texas,1000,20

The above solution doesn't work because dictionary will contain one one key value. i gave up and finally used a loop. below code is working, mentioned input too
csv1
state_name,city_name
CA,sacramento
utah,saltlake
CA,san jose
Utah,provo
CA,sanfrancisco
TX,austin
TX,dallas
OR,portland
CSV2
city_name population humidity
sacramento 1000 1
saltlake 300 5
san jose 500 2
provo 100 7
sanfrancisco 700 3
austin 2000 4
dallas 2500 5
portland 300 6
def mapping_within_dataframe(self, file1,file2,file3):
self.csv1 = file1
self.csv2 = file2
self.outcsv = file3
one_state_data = 0
outfile = csv.writer(open('self.outcsv', 'w'), delimiter=',')
state_city = read_csv(self.csv1)
city_data = read_csv(self.csv2)
all_state = list(set(state_city.state_name))
for one_state in all_state:
one_state_cities = list(state_city.loc[state_city.state_name == one_state, "city_name"])
one_state_data = 0
for one_city in one_state_cities:
one_city_data = city_data.loc[city_data.city_name == one_city, "population"].sum()
one_state_data = one_state_data + one_city_data
print one_state, one_state_data
outfile.writerows(whatever)

def output(file1, file2):
f = lambda x: x.strip() #strips newline and white space characters
with open(file1) as cities:
with open(file2) as states:
states_dict = {}
cities_dict = {}
for line in states:
line = line.split(',')
states_dict[f(line[0])] = f(line[1])
for line in cities:
line = line.split(',')
cities_dict[f(line[0])] = (int(f(line[1])) , int(f(line[2])))
for state , city in states_dict.iteritems():
try:
print state, cities_dict[city]
except KeyError:
pass
output(CSV1,CSV2) #these are the names of the files
This gives the output you wanted. Just make sure the names of cities in both files are the same in terms of capitalization.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Reading statistics from a .txt file and outputting them - python

Related

Python Regex Capturing Multiple Matches in separate observations

Continued Difficulties with Python Question

How can i split string?

Iterating over set of lists to find highest average in Python

Data Analysis using Python

Categories

Resources