Data Analysis using Python - python

I have 2 CSV files. One with city name, population and humidity. In second cities are mapped to states. I want to get state-wise total population and average humidity. Can someone help? Here is the example:
CSV 1:
CityName,population,humidity
Austin,1000,20
Sanjose,2200,10
Sacramento,500,5
CSV 2:
State,city name
Ca,Sanjose
Ca,Sacramento
Texas,Austin
Would like to get output(sum population and average humidity for state):
Ca,2700,7.5
Texas,1000,20

The above solution doesn't work because a dictionary keeps only one value per key, so a state with several cities retains just one of them. I gave up and finally used a loop. The code below works; the input files are shown as well.
csv1
state_name,city_name
CA,sacramento
utah,saltlake
CA,san jose
Utah,provo
CA,sanfrancisco
TX,austin
TX,dallas
OR,portland
CSV2
city_name population humidity
sacramento 1000 1
saltlake 300 5
san jose 500 2
provo 100 7
sanfrancisco 700 3
austin 2000 4
dallas 2500 5
portland 300 6
def mapping_within_dataframe(self, file1, file2, file3):
    """Write per-state population totals and humidity averages to a CSV file.

    Args:
        file1: Path of the state/city CSV (columns ``state_name`` and
            ``city_name``).
        file2: Path of the city statistics CSV (columns ``city_name``,
            ``population`` and ``humidity``).
        file3: Path of the CSV file to create. Each row holds
            ``state_name, total population, average humidity``.

    The three paths are also stored on ``self`` as ``csv1``, ``csv2`` and
    ``outcsv``. State names are compared exactly as they appear in *file1*,
    so ``utah`` and ``Utah`` are reported as two different states.
    """
    self.csv1 = file1
    self.csv2 = file2
    self.outcsv = file3

    state_city = read_csv(self.csv1)
    city_data = read_csv(self.csv2)

    # A left join keeps every state in the result, even when none of its
    # cities has a statistics row.
    merged = state_city.merge(city_data, on="city_name", how="left")
    grouped = merged.groupby("state_name", sort=True)
    totals = grouped["population"].sum()
    averages = grouped["humidity"].mean()

    # tolist() turns NumPy scalars into plain Python numbers so they are
    # written and printed without NumPy-specific formatting.
    rows = list(zip(totals.index.tolist(), totals.tolist(), averages.tolist()))

    # Open the path held in self.outcsv (not a file literally named
    # 'self.outcsv') and make sure it is closed again.
    with open(self.outcsv, "w", newline="") as handle:
        csv.writer(handle, delimiter=",").writerows(rows)

    for state, population, humidity in rows:
        print(state, population, humidity)

def output(file1, file2):
    """Print ``state,total_population,average_humidity`` for every state.

    Args:
        file1: Path of the cities CSV; each data row is
            ``city name, population, humidity``.
        file2: Path of the states CSV; each data row is ``state, city name``.

    Rows whose numeric fields cannot be parsed (for example a header row)
    and cities that appear in only one of the files are skipped. City names
    must be spelled and capitalised identically in both files.
    """
    import csv  # local import: this snippet has no import block of its own

    # city name -> (population, humidity)
    city_stats = {}
    with open(file1, newline="") as cities:
        for row in csv.reader(cities):
            if len(row) < 3:
                continue
            try:
                city_stats[row[0].strip()] = (int(row[1]), int(row[2]))
            except ValueError:
                # Header row or other non-numeric line.
                continue

    # state -> [population sum, humidity sum, number of cities]; a state can
    # own several cities, so the values are accumulated rather than replaced.
    state_totals = {}
    with open(file2, newline="") as states:
        for row in csv.reader(states):
            if len(row) < 2:
                continue
            state, city = row[0].strip(), row[1].strip()
            stats = city_stats.get(city)
            if stats is None:
                continue
            totals = state_totals.setdefault(state, [0, 0, 0])
            totals[0] += stats[0]
            totals[1] += stats[1]
            totals[2] += 1

    for state, (population, humidity, count) in state_totals.items():
        print("{},{},{:g}".format(state, population, float(humidity) / count))
output(CSV1,CSV2) #these are the names of the files
This gives the output you wanted. Just make sure the names of cities in both files are the same in terms of capitalization.

Related

Reading statistics from a .txt file and outputting them

I am supposed to get certain information from a .txt file and output it. This is the information I need:
State with the maximum population
State with the minimum population
Average state population
State of Texas population
The DATA looks like:
Alabama
AL
4802982
Alaska
AK
721523
Arizona
AZ
6412700
Arkansas
AR
2926229
California
CA
37341989
This is my code that does not really do anything I need it to do:
def main():
    """Print the name, abbreviation and population of every state record.

    ``StateCensus2010.txt`` is read as consecutive three-line records
    (state name, abbreviation, population). Blank lines between or after
    records are ignored. A record whose population line is missing or not
    a number raises ``ValueError``.
    """
    # The with-block closes the file even if a record fails to parse.
    with open('StateCensus2010.txt', 'r') as census_file:
        lines = iter(census_file)
        for state_name in lines:
            state_name = state_name.rstrip('\n')
            if not state_name.strip():
                # A trailing blank line is not the start of a new record.
                continue
            state_abv = next(lines, '').rstrip('\n')
            population = int(next(lines, ''))
            print('State Name: ', state_name)
            print('State Abv.: ', state_abv)
            print('Population: ', population)
            print()
main()
All I have it doing is reading the state name, abv and converting the population into an int. I don't need it to do anything of that, however I'm unsure how to do what the assignment is asking. Any hints would definitely be appreciated! I've been trying some things for the past few hours to no avail.
Update:
This is my updated code; however, I'm receiving the following error:
Traceback (most recent call last):
File "main.py", line 13, in <module>
if population > max_population:
TypeError: unorderable types: str() > int()
Code:
# Second attempt from the question: read the census file and try to find the
# largest population. Kept as posted; the notes below explain the failure.
with open('StateCensus2010.txt', 'r') as census_file:
while True:
try:
state_name = census_file.readline()
state_abv = census_file.readline()
population = int(census_file.readline())
# NOTE(review): int() reports an empty or non-numeric line with ValueError,
# which this IOError handler does not catch.
except IOError:
break
# data processing here
max_population = 0
# Iterating over the file object yields text lines (str), so the comparison
# below is str against int -- the TypeError shown in the traceback.
for population in census_file:
if population > max_population:
max_population = population
print(max_population)
The data is in a consistent order: state name, state abbreviation, population. So you only need to read the lines once, three at a time, and display all three pieces of information. Below is the sample code.
# Summarise StateCensus2010.txt: print every record, then the average
# population, the least and most populous state, and the population of Texas.
average = 0.0
total = 0.0
# Infinity as the starting minimum: any real population is smaller.
state_min = float('inf')
state_max = 0
statename_min = ''
statename_max = ''
texas_population = 0

with open('StateCensus2010.txt', 'r') as census_file:
    # Blank lines are dropped so that a trailing newline cannot add a
    # phantom entry and skew the record count (and with it the average).
    data = [line.strip() for line in census_file if line.strip()]

# Each state occupies three lines: name, abbreviation, population.
state_total = len(data) // 3

for index in range(state_total):
    statename, state_abv, population_text = data[3 * index:3 * index + 3]
    population = int(population_text)
    print('Statename : ',statename)
    print('State Abv : ',state_abv)
    print('Population: ',population)
    print()
    # sum all states population
    total += population
    if population > state_max:
        state_max = population
        statename_max = statename
    if population < state_min:
        state_min = population
        statename_min = statename
    if statename == 'Texas':
        texas_population = population

# divide the total population by the number of states (0.0 for an empty file)
average = total / state_total if state_total else 0.0
print(str(average))
print('Lowest population state :', statename_min)
print('Highest population state :', statename_max)
print('Texas population :', texas_population)
This problem is pretty easy using pandas.
Code:
# Build one record per state from `data`, an iterator over text lines laid
# out as repeating (state name, abbreviation, population) triples.
states = []
for line in data:
    if not line.strip():
        # Skip blank separator/trailing lines instead of starting a record.
        continue
    states.append(
        dict(state=line.strip(),
             abbrev=next(data).strip(),
             pop=int(next(data)),
             )
    )
df = pd.DataFrame(states)
print(df)
# idxmax()/idxmin() return index labels, so the rows are fetched with the
# label-based .loc accessor (the old .ix accessor no longer exists in pandas).
print('\nmax population:\n', df.loc[df['pop'].idxmax()])
print('\nmin population:\n', df.loc[df['pop'].idxmin()])
print('\navg population:\n', df['pop'].mean())
print('\nAZ population:\n', df[df.abbrev == 'AZ'])
Test Data:
# Sample records wrapped in an in-memory text stream, so the snippet can be
# run without a data file on disk.
from io import StringIO
# The [1:-1] slice drops the empty first and last pieces that split('\n')
# produces for the newline after the opening quotes and before the closing ones.
data = StringIO(u'\n'.join([x.strip() for x in """
Alabama
AL
4802982
Alaska
AK
721523
Arizona
AZ
6412700
Arkansas
AR
2926229
California
CA
37341989
""".split('\n')[1:-1]]))
Results:
abbrev pop state
0 AL 4802982 Alabama
1 AK 721523 Alaska
2 AZ 6412700 Arizona
3 AR 2926229 Arkansas
4 CA 37341989 California
max population:
abbrev CA
pop 37341989
state California
Name: 4, dtype: object
min population:
abbrev AK
pop 721523
state Alaska
Name: 1, dtype: object
avg population:
10441084.6
AZ population:
abbrev pop state
2 AZ 6412700 Arizona
Another pandas solution, from the interpreter:
>>> import pandas as pd
>>>
>>> records = [line.strip() for line in open('./your.txt', 'r')]
>>>
>>> # Chunk the lines into (state, code, population) triples.
>>> df = pd.DataFrame([records[i:i+3] for i in range(0, len(records), 3)],
... columns=['State', 'Code', 'Pop']).dropna()
>>>
>>> df['Pop'] = df['Pop'].astype(int)
>>>
>>> df
State Code Pop
0 Alabama AL 4802982
1 Alaska AK 721523
2 Arizona AZ 6412700
3 Arkansas AR 2926229
4 California CA 37341989
>>>
>>> # .loc is the label-based accessor; .ix has been removed from pandas.
>>> df.loc[df['Pop'].idxmax()]
State California
Code CA
Pop 37341989
Name: 4, dtype: object
>>>
>>> df.loc[df['Pop'].idxmin()]
State Alaska
Code AK
Pop 721523
Name: 1, dtype: object
>>>
>>> df['Pop'].mean()
10441084.6
>>>
>>> df.loc[df['Code'] == 'AZ' ]
State Code Pop
2 Arizona AZ 6412700
Please try this; the earlier code was not Python 3 compatible (it only supported Python 2.7).
def extract_data(state):
    """Print summary statistics for a mapping of state records.

    Args:
        state: Mapping of state abbreviation to a dict that holds at least
            a ``'population'`` entry, e.g.
            ``{'TX': {'population': 25268418, 'state_name': 'Texas'}}``.

    Prints, one item per line: the abbreviation and population of the most
    populous state, the same for the least populous state, the number of
    states, the truncated average population, and the population stored
    under ``'TX'`` (``None`` when there is no such entry). Nothing is
    printed for an empty mapping.
    """
    if not state:
        # No records: there is no highest, lowest or average to report.
        return

    populations = {abbrev: stats.get('population')
                   for abbrev, stats in state.items()}
    # max()/min() keep the first entry among ties, matching a scan that only
    # replaces the current best on a strictly larger/smaller value.
    highest_state = max(populations, key=populations.get)
    lowest_state = min(populations, key=populations.get)
    total_population = sum(populations.values())

    print(highest_state, populations[highest_state])
    print(lowest_state, populations[lowest_state])
    print(len(state))
    print(int(total_population / len(state)))
    # Tolerate data sets that do not contain Texas.
    print(state.get('TX', {}).get('population'))
def main():
    """Read ``states.txt`` and return its records keyed by abbreviation.

    The file is read as consecutive three-line records (state name,
    abbreviation, population); blank lines between or after records are
    ignored.

    Returns:
        dict mapping each abbreviation to
        ``{'population': int, 'state_name': str}``. When an abbreviation
        occurs more than once, the last record wins.

    Raises:
        ValueError: if a population line is missing or not an integer.
    """
    state = {}
    # The with-block closes the file even if a record fails to parse.
    with open('states.txt', 'r') as census_file:
        lines = iter(census_file)
        for state_name in lines:
            state_name = state_name.rstrip('\n')
            if not state_name.strip():
                # A trailing blank line is not the start of a new record.
                continue
            state_abv = next(lines, '').rstrip('\n')
            population = int(next(lines, ''))
            state[state_abv] = {'population': population,
                                'state_name': state_name}
    return state
state=main()
extract_data(state)

How do I divide a list into smaller lists?

My list is formatted like:
gymnastics_school,participant_name,all-around_points_earned
I need to divide it up by schools but keep the scores.
import collections
def main():
    """Parse ``state_meet.txt`` into a list of ``Data`` records.

    Each non-blank line has the form ``school,participant,points``.

    Returns:
        list of ``Data`` namedtuples with the fields ``gymnastics_school``,
        ``participant_name`` and ``all_around_points_earned`` (the points
        converted to ``float``), in file order.
    """
    names = ["gymnastics_school", "participant_name", "all_around_points_earned"]
    Data = collections.namedtuple("Data", names)
    data = []
    with open('state_meet.txt','r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no record.
                continue
            items = line.split(',')
            items[2] = float(items[2])
            data.append(Data(*items))
    # Hand the records back so that callers can group or rank them.
    return data
These are examples of how they're set up:
Lanier City Gymnastics,Ben W.,55.301
Lanier City Gymnastics,Alex W.,54.801
Lanier City Gymnastics,Sky T.,51.2
Lanier City Gymnastics,William G.,47.3
Carrollton Boys,Cameron M.,61.6
Carrollton Boys,Zachary W.,58.7
Carrollton Boys,Samuel B.,58.6
La Fayette Boys,Nate S.,63
La Fayette Boys,Kaden C.,62
La Fayette Boys,Cohan S.,59.1
La Fayette Boys,Cooper J.,56.101
La Fayette Boys,Avi F.,53.401
La Fayette Boys,Frederic T.,53.201
Columbus,Noah B.,50.3
Savannah Metro,Levi B.,52.801
Savannah Metro,Taylan T.,52
Savannah Metro,Jacob S.,51.5
SAAB Gymnastics,Dawson B.,58.1
SAAB Gymnastics,Dean S.,57.901
SAAB Gymnastics,William L.,57.101
SAAB Gymnastics,Lex L.,52.501
Suwanee Gymnastics,Colin K.,57.3
Suwanee Gymnastics,Matthew B.,53.201
After processing it should look like:
Lanier City Gymnastics:participants(4)
as it own list
Carrollton Boys(3)
as it own list
La Fayette Boys(6)
etc.
I would recommend putting them in dictionaries:
# Group the meet results by school: school name -> list of [participant, points].
data = {}
with open('state_meet.txt','r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue
        items = line.split(',')
        items[2] = float(items[2])
        # setdefault creates the school's list on first sight, then appends.
        data.setdefault(items[0], []).append(items[1:])
Then access schools could be done in the following way:
>>> data['Lanier City Gymnastics']
[['Ben W.', 55.301], ['Alex W.', 54.801], ['Sky T.', 51.2], ['William G.', 47.3]]
EDIT:
Assuming you need the whole dataset as a list first, then you want to divide it into smaller lists you can generate the dictionary from the list:
# Step 1: read the whole file into a flat list of [school, participant, points].
data = []
with open('state_meet.txt','r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue
        items = line.split(',')
        items[2] = float(items[2])
        data.append(items)
#perform median or other operation on your data
# Step 2: regroup the flat list by school. The grouping goes into
# nested_data; `data` is a list and cannot be indexed by school name.
nested_data = {}
for items in data:
    nested_data.setdefault(items[0], []).append(items[1:])
# nested_data[school] is now that school's list of [participant, points] pairs.
When you need to get a subset of a list you can use slicing:
mylist[start:stop:step]
where start, stop and step are optional (see link for more comprehensive introduction)

Biopython translate() error

I have a file that looks as so:
Type Variant_class ACC_NUM dbsnp genomic_coordinates_hg18 genomic_coordinates_hg19 HGVS_cdna HGVS_protein gene disease sequence_context_hg18 sequence_context_hg19 codon_change codon_number intron_number site location location_reference_point author journal vol page year pmid entrezid sift_score sift_prediction mutpred_score
1 DM CM920001 rs1800433 null chr12:9232351:- NM_000014.4 NP_000005.2:p.C972Y A2M Chronicobstructivepulmonarydisease null CACAAAATCTTCTCCAGATGCCCTATGGCT[G/A]TGGAGAGCAGAATATGGTCCTCTTTGCTCC TGT TAT 972 null null 2 null Poller HUMGENET 88 313 1992 1370808 2 0 DAMAGING 0.594315245478036
1 DM CM004784 rs74315453 null chr22:43089410:- NM_017436.4 NP_059132.1:p.M183K A4GALT Pksynthasedeficiency(pphenotype) null TGCTCTCCGACGCCTCCAGGATCGCACTCA[T/A]GTGGAAGTTCGGCGGCATCTACCTGGACAC ATG AAG 183 null null 2 null Steffensen JBC 275 16723 2000 10747952 53947 0 DAMAGING 0.787878787878788
I want to translate the information from column 13 and 14 to their corresponding amino acids. Here is the script that I've generated:
# Script from the question: read two codon columns from a tab-separated file
# and write each codon next to its translation. Kept as posted; the notes
# explain the error quoted after it.
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
# InFile is opened and its first line is read, but it is not used again in
# the code shown here.
InFile = open("disease_mut_splitfinal.txt", 'rU')
InFile.readline()
OriginalSeq_list = []
MutSeq_list = []
import csv
with open("disease_mut_splitfinal.txt") as f:
reader = csv.DictReader(f, delimiter= "\t")
for row in reader:
OriginalSeq = row['codon_change']
# NOTE(review): the second codon is taken from the 'codon_number' column --
# presumably because the data rows hold one more field than the header; confirm.
MutSeq = row['codon_number']
region = row["genomic_coordinates_hg19"]
gene = row["gene"]
OriginalSeq_list.append(OriginalSeq)
MutSeq_list.append(MutSeq)
OutputFileName = "Translated.txt"
OutputFile = open(OutputFileName, 'w')
OutputFile.write(''+region+'\t'+gene+'\n')
for i in range(0, len(OriginalSeq_list)):
# The list items are the plain strings read by csv.DictReader; Seq is imported
# above but never applied to them.
OrigSeq = OriginalSeq_list[i]
MutSEQ = MutSeq_list[i]
print OrigSeq
# On a plain string, .translate() is the built-in string method, which
# requires a table argument -- hence the TypeError quoted in the question.
translated_original = OrigSeq.translate()
translated_mut= MutSEQ.translate()
OutputFile.write("\n" + OriginalSeq_list[i]+ "\t" + str(translated_original) + "\t" +MutSeq_list[i] + "\t" + str(translated_mut)+ "\n")
However, I keep getting this error:
TypeError: translate expected at least 1 arguments, got 0
I'm kind of at a loss for what I'm doing wrong. Any suggestions?
https://www.dropbox.com/s/cd8chtacj3glb8d/disease_mut_splitfinal.txt?dl=0
(File should still be downloadable even if you don't have a dropbox)
You are using the string method "translate" instead of the biopython seq object method translate, which is what I assume you want to do. You need to convert the string into a seq object and then translate that. Try
# Variant 1: import the Seq module and build the sequence object via Seq.Seq.
from Bio import Seq
# Wrap the plain string first, so that translate() below is called on the
# sequence object instead of on a str.
OrigSeq = Seq.Seq(OriginalSeq_list[i])
translated_original = OrigSeq.translate()
Alternatively
# Variant 2: import the Seq class directly.
from Bio.Seq import Seq
# Wrap the plain string first, so that translate() below is called on the
# sequence object instead of on a str.
OrigSeq = Seq(OriginalSeq_list[i])
translated_original = OrigSeq.translate()

How to compare a zero padded number in dictionary with a non zero padded number

I have two files and I need to compare both of them & update the value of the 1st file from the 2nd file.
My first file is as below,
SeqNo City State
1 Chicago IL
2 Boston MA
3 New York NY
4 Los Angeles CA
5 Seattle WA
My second file is as below,
SeqNo City State NewSeqNo
005 Seattle WA 001
001 Chicago IL 002
004 Los Angeles CA 003
002 Boston MA 004
003 New York NY 005
I have the following code to update the sequence number in the first file with the value of NewSeqNo from the second file and save the result as a third file. But it fails to match any key, because SeqNo is zero-padded in the second file whereas it is not in the first:
# Script from the question: replace each SeqNo in 'firstfile' with the
# NewSeqNo recorded for it in 'secondfile'. Kept as posted.
import csv
lookup = {}
with open('secondfile') as f:
reader = csv.reader(f)
for line in reader:
oldseq, city, state, newseq = line
# Keys are stored exactly as read; in the sample data that is the
# zero-padded text such as '001'.
lookup[oldseq] = newseq
with open('firstfile') as f, open('outfile','w') as w:
reader = csv.reader(f)
writer = csv.writer(w)
for line in reader:
seq, city, state = line
# Exact string comparison: the unpadded '1' from the first file does not
# equal the padded '001' key, so no row is updated.
if seq in lookup:
seq = lookup[seq]
writer.writerow([seq, city, state])
For example, the contents of the third file should be:
NewSeqNo City State
002 Chicago IL
004 Boston MA
005 New York NY
003 Los Angeles CA
001 Seattle WA
Any help is appreciated
Convert your 'numbers' to integers to remove the padding before storing in the dictionary:
import csv


def _seq_key(value):
    """Return *value* as an int when it is numeric, otherwise as stripped text.

    Converting to int removes zero padding ('001' and '1' both become 1);
    non-numeric cells such as the 'SeqNo' header are kept as text so the
    header row is mapped like any other row instead of crashing int().
    """
    try:
        return int(value)
    except ValueError:
        return value.strip()


# old sequence number (padding removed) -> new sequence number as written
lookup = {}
with open('secondfile', newline='') as f:
    for line in csv.reader(f):
        if not line:
            continue  # blank line
        oldseq, city, state, newseq = line
        lookup[_seq_key(oldseq)] = newseq

with open('firstfile', newline='') as f, open('outfile', 'w', newline='') as w:
    writer = csv.writer(w)
    for line in csv.reader(f):
        if not line:
            continue  # blank line
        seq, city, state = line
        # Rows without a match keep their original sequence number.
        writer.writerow([lookup.get(_seq_key(seq), seq), city, state])
Now lookup has integer keys, and when looking up matching keys in the second loop, we pass in integer keys again.
If you know that it is always padded for a length of 3, when reading your first file, you can convert your seq to an int and use format to write a padded value:
# Variant that keeps the padded string keys of `lookup` and pads the
# sequence numbers read from the first file to the same width instead.
with open('firstfile') as f, open('outfile','w') as w:
    reader = csv.reader(f)
    writer = csv.writer(w)
    for line in reader:
        seq, city, state = line
        try:
            # Convert to padded value
            seq = "{:03}".format(int(seq))
        except ValueError:
            # Non-numeric cell (e.g. the 'SeqNo' header): leave it as read.
            pass
        if seq in lookup:
            seq = lookup[seq]
        writer.writerow([seq, city, state])
#!/usr/bin/python
"""Print the rows of the file 'old' renumbered with the numbers from 'new'.

'old' holds whitespace-separated ``SeqNo City... State`` lines; 'new' holds
``SeqNo City... State NewSeqNo`` lines. Rows are matched on their
``City... State`` text, so zero padding of the numbers does not matter.
"""
old_dict = dict()   # old sequence number -> 'City... State'
new_dict = dict()   # 'City... State' -> new sequence number

with open('old', 'r') as fh:
    for l in fh:
        r = l.split()
        if not r:
            continue
        try:
            key = int(r[0])
        except ValueError:
            continue  # header row
        old_dict[key] = ' '.join(r[1:])

with open('new', 'r') as fh:
    for l in fh:
        r = l.split()
        if not r:
            continue
        try:
            value = int(r[-1])
        except ValueError:
            continue  # header row
        new_dict[' '.join(r[1:-1])] = value

for i, j in old_dict.items():
    d = j.split()
    if not d:
        continue  # a line that held nothing but a sequence number
    # A row without a counterpart in 'new' keeps its old number.
    print('%0.3d %s %s' % (new_dict.get(j, i), ' '.join(d[0:-1]), d[-1]))
Output:
002 Chicago IL
004 Boston MA
005 New York NY
003 Los Angeles CA
001 Seattle WA

Handling word index in text files

wordlist A: book jesus christ son david son abraham jacob judah his brothers perez amminadab
wordlist B: akwụkwọ jizọs kraịst nwa devid nwa ebreham jekọb juda ya ụmụnne pirez aminadab
file.txt A:
the book of the history of jesus christ , son of david , son of abraham :
abraham became father to isaac ; isaac became father to jacob ; jacob became father to judah and his brothers ;
file.txt B:
akwụkwọ nke kọrọ akụkọ banyere jizọs kraịst , nwa devid , nwa ebreham :
ebreham mụrụ aịzik ; aịzik amụọ jekọb ; jekọb amụọ juda na ụmụnne ya ndị ikom ;
I have 2 above word-lists (say A & B) of 2 diff. languages. Both contain word translation of each other in order. My task is to run these word-lists through 2 separate files.txt of both languages like word-list A through file.txt A and vice versa, then return a line for both txt files, each will contain the index numbers of both word-list where they were found on each line of the txt paired like:
2:1 7:6 8:7 10:9 12:10 14:12 16:13 [ 2:1 = 2 index of book in txt.file A and 1-akwụkwọ in txt.file B and so on]
1:1 11:6 13:8 17:10 19:12 20:13 [ 1:1 = 1 index of abraham in txt.file A and 1- ebreham in txt.file B and so on].
see codes below:
# Script from the question: pair up the positions of word-list entries found
# in parallel lines of two texts. Kept as posted; the notes point at the
# visible problems.
import sys
def wordlist(filename):
# Returns the lines of `filename` unmodified (nothing is stripped).
wordlist = []
with open(filename, 'rb') as f:
for line in f:
wordlist.append(line)
return wordlist
eng = []
for lines in open('eng_try.txt', 'rb'):
line = lines.strip()
eng.append(line)
igb = []
for lines in open('igb_try.txt', 'rb'):
line = lines.strip()
igb.append(line)
i = 0
while i < len(eng):
eng_igb_verse_pair = eng[i] + " " + igb[i]
line = eng_igb_verse_pair.strip().split()
# wordlist(...) re-reads the file on every call; .pop(n) on that fresh
# temporary list therefore acts like a plain [n] lookup.
for n in range(0, len(wordlist('eng_wordlist.txt'))):
eng_word = wordlist('eng_wordlist.txt').pop(n)
igb_word = wordlist('igb_wordlist.txt').pop(n)
# NOTE(review): entries returned by wordlist() still carry their line ending,
# while the tokens in `line` come from split() and never do -- presumably why
# this test fails and nothing is printed.
if eng_word in line and igb_word in line:
# line.index[...] subscripts the method object instead of calling it;
# a call would be line.index(eng_word).
print '{0} {1}:{2}'.format(i, line.index[eng_word], line.index[igb_word])
i += 1
This actually prints nothing. I know my problem is in the last segment of the program. Can someone help? I am not a very experienced Python programmer. Apologies if I didn't explain this well.
You mean something like this:
import sys

# Parallel word lists: eng[k] is translated by igb[k]. Filled in by main().
eng = []
igb = []


def checkLine(line_eng, line_igb, source_words=None, target_words=None):
    """Print the ``i:j`` position pairs of translated words in two lines.

    Args:
        line_eng: Line of source-language text.
        line_igb: The parallel line of target-language text.
        source_words: Source word list; defaults to the module-level ``eng``.
        target_words: Translations aligned index by index with
            *source_words*; defaults to the module-level ``igb``.

    For every token of *line_eng* that is in the source word list, the
    1-based position of the token and the 1-based position of its
    translation in *line_igb* are printed as ``i:j``. The pairs are
    separated by spaces and no newline is written. A word that occurs
    several times is paired with successive occurrences of its translation;
    a word whose translation is absent from *line_igb* is skipped.
    """
    if source_words is None:
        source_words = eng
    if target_words is None:
        target_words = igb

    # The first listed translation of a word wins, like igb[eng.index(word)].
    translation = {}
    for source, target in zip(source_words, target_words):
        translation.setdefault(source, target)

    igb_tokens = line_igb.split()
    used = set()    # target positions that are already paired
    pairs = []
    for eng_pos, word in enumerate(line_eng.split(), 1):
        if word not in translation:
            continue
        wanted = translation[word]
        for igb_pos, token in enumerate(igb_tokens, 1):
            if token == wanted and igb_pos not in used:
                used.add(igb_pos)
                pairs.append("%d:%d" % (eng_pos, igb_pos))
                break
    print(" ".join(pairs), end="")


def linelist(filename):
    """Return the lines of *filename* (UTF-8 text) as a list."""
    with open(filename, encoding="utf-8") as handle:
        return list(handle)


def _read_words(filename):
    """Return every whitespace-separated word of *filename*, in order."""
    with open(filename, encoding="utf-8") as handle:
        return [word for line in handle for word in line.split()]


def main():
    """Load the word lists and print the position pairs of every line pair."""
    eng[:] = _read_words('eng_try.txt')
    igb[:] = _read_words('igb_try.txt')
    eng_lines = linelist("eng_wordlist.txt")
    igb_lines = linelist("igb_wordlist.txt")
    for n, (line_eng, line_igb) in enumerate(zip(eng_lines, igb_lines), 1):
        print("%d. " % n, end="")
        checkLine(line_eng, line_igb)
        print()


if __name__ == "__main__":
    main()
For your files i got result:
1. 2:1 7:6 8:7 10:9 12:10 10:9 16:13
2. 1:1 11:7 11:7 17:11 19:14 20:13
BR
Parasit Hendersson

Categories

Resources