I have been provided with a file containing data on recorded sightings of species, which is laid out in the format:
"Species", "\t", "Latitude", "\t", "Longitude"
I need to define a function that will load the data from the file into a list, splitting every line into its three components: species name, latitude and longitude.
This is what I have, but it is not working:
def LineToList(FileName):
    FileIn = open(FileName, "r")
    DataList = []
    for Line in FileIn:
        Line = Line.rstrip()
        DataList.append(Line)
        EntryList = []
        for Entry in Line:
            Entry = Line.split("\t")
            EntryList.append(Entry)
    FileIn.close()
    return DataList

LineToList("Mammal.txt")
print(DataList[1])
I need the data on each line to be separated so that I can use it later to work out whether each sighting was within a certain distance of a given location.
Sample Data:
Myotis nattereri 54.07663633 -1.006446707
Myotis nattereri 54.25637837 -1.002130504
Myotis nattereri 54.25637837 -1.002130504
I am trying to print one line of the data set to test whether it is splitting correctly, but nothing is showing in the shell.
Update:
This is the code I am working with now:
def LineToList(FileName):
    FileIn = open(FileName, "r")
    DataList = []
    for Line in FileIn:
        Line = Line.rstrip()
        DataList.append(Line)
        EntryList = []
        for Entry in Line:
            Entry = Line.split("\t")
            EntryList.append(Entry)
            return EntryList
    FileIn.close()
    return DataList
def CalculateDistance(Lat1, Lon1, Lat2, Lon2):
    Lat1 = float(Lat1)
    Lon1 = float(Lon1)
    Lat2 = float(Lat2)
    Lon2 = float(Lon2)

    nDLat = (Lat1 - Lat2) * 0.017453293
    nDLon = (Lon1 - Lon2) * 0.017453293

    Lat1 = Lat1 * 0.017453293
    Lat2 = Lat2 * 0.017453293

    nA = (math.sin(nDLat/2) ** 2) + math.cos(Lat1) * math.cos(Lat2) * (math.sin(nDLon/2) ** 2)
    nC = 2 * math.atan2(math.sqrt(nA), math.sqrt(1 - nA))
    nD = 6372.797 * nC

    return nD
DataList = LineToList("Mammal.txt")

for Line in DataList:
    LocationCount = 0
    CalculateDistance(Entry[1], Entry[2], 54.988056, -1.619444)
    if CalculateDistance <= 10:
        LocationCount += 1

print("Number Recordings within Location Range:", LocationCount)
When running the programme, it comes up with an error:
CalculateDistance(Entry[1], Entry[2], 54.988056, -1.619444)
NameError: name 'Entry' is not defined
I saw "Biological Sciences" in your profile and just because of that i would recommend you to take a closer look at Pandas module.
It can be very easy:
import pandas as pd

df = pd.read_csv('mammal.txt', sep='\t',
                 names=['species', 'latitude', 'longitude'],
                 header=None)
print(df)
Output:
            species   latitude  longitude
0  Myotis nattereri  54.076636  -1.006447
1  Myotis nattereri  54.256378  -1.002131
2  Myotis nattereri  54.256378  -1.002131
Your DataList variable is local to the LineToList function; you have to assign to another variable at file scope:
DataList = LineToList("Mammal.txt")
print(DataList[1])
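If the goal is for LineToList itself to return the split rows, one possible rewrite (a minimal sketch, assuming the file really is tab-delimited with one sighting per line) is:

def LineToList(FileName):
    DataList = []
    with open(FileName, "r") as FileIn:
        for Line in FileIn:
            Line = Line.rstrip()
            if Line:  # skip any blank lines
                # each entry becomes ['species', 'latitude', 'longitude']
                DataList.append(Line.split("\t"))
    return DataList

DataList = LineToList("Mammal.txt")
print(DataList[1])  # e.g. ['Myotis nattereri', '54.25637837', '-1.002130504']

Each entry is then a three-element list of strings, so the latitude and longitude can be read as Entry[1] and Entry[2] in the later distance loop.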
I think you have a regular tab-delimited CSV that csv.reader can easily parse for you.
import csv

DataList = [row for row in csv.reader(open('Mammal.txt'), dialect='excel-tab')]

for data in DataList:
    print(data)
This results in
['Myotis nattereri', '54.07663633', '-1.006446707']
['Myotis nattereri', '54.25637837', '-1.002130504']
['Myotis nattereri', '54.25637837', '-1.002130504']
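With the rows already split into ['species', 'latitude', 'longitude'] strings, the counting loop from the question could then look something like this (a rough sketch reusing the CalculateDistance function from the update, with the 10 km threshold given there):

LocationCount = 0
for Entry in DataList:
    Distance = CalculateDistance(Entry[1], Entry[2], 54.988056, -1.619444)
    if Distance <= 10:
        LocationCount += 1

print("Number Recordings within Location Range:", LocationCount)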
Related
I'm trying to write the output data of this for loop to a .csv file. However, what gets written to the .csv file with the current code is the data of a single iteration only. I want to append the data as a new row for each iteration. How do I do that?
for i in range(0,8):
    # for j in range (0,2):
    gyro_xy_orth = (np.dot(H[i,:], xy_orth)/np.square(xy_norm))*xy_orth
    gyro_xy = H[i,:] - gyro_xy_orth

    gyro_xz_orth = (np.dot(H[i,:], xz_orth)/np.square(xz_norm))*xz_orth
    gyro_xz = H[i,:] - gyro_xz_orth

    gyro_yz_orth = (np.dot(H[i,:], yz_orth)/np.square(yz_norm))*yz_orth
    gyro_yz = H[i,:] - gyro_yz_orth

    gyro_xy_m_orth = (np.dot(H[i,:], xy_m_orth)/np.square(xy_norm))*xy_m_orth
    gyro_xy_m = H[i,:] - gyro_xy_m_orth

    gyro_xz_m_orth = (np.dot(H[i,:], xz_m_orth)/np.square(xz_norm))*xz_m_orth
    gyro_xz_m = H[i,:] - gyro_xz_m_orth

    gyro_yz_m_orth = (np.dot(H[i,:], yz_m_orth)/np.square(yz_norm))*yz_m_orth
    gyro_yz_m = H[i,:] - gyro_yz_m_orth

    gyro_projections = [gyro_xy, gyro_xz, gyro_yz, gyro_xy_m, gyro_xz_m, gyro_yz_m]
    #print("Projection of gyro ", i+1, " on planes xyz: ", gyro_projections)

    with open('/Users/path/gyro_config.csv', 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerows(gyro_projections)
Try append mode instead of write mode:
with open('/Users/emanalawadhi/Documents/MBRSCWork/TASKSWork/SSDD/MBZsat/GYRO/gyro_config.csv', 'a', newline='') as f:
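As a design note, another option is to open the file once before the loop so every iteration writes into the same writer. A minimal sketch, where compute_projections is a hypothetical stand-in for the projection code shown in the question and the path is shortened:

import csv

with open('gyro_config.csv', 'w', newline='') as f:  # hypothetical path
    writer = csv.writer(f, delimiter=',')
    for i in range(0, 8):
        # compute_projections is a hypothetical helper standing in for the
        # projection calculations above; it returns the six gyro_* arrays
        gyro_projections = compute_projections(i)
        writer.writerows(gyro_projections)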
I have two lists of dictionaries in a Flask app. I'd like to return a JSON response with the contents of the two lists.
I'm trying to calculate distances between two locations by their LONGITUDE/LATITUDE properties from an input CSV file, and to respond with a JSON file filled with the results.
Below is the processing and POST part of the application.
When I run it, I receive TypeError: unhashable type: 'list' on the last line, return jsonify({points : points, links : links}).
@app.route("/get_address", methods = ['POST'])
def process_file(points):
    def upload_file():
        if request.method == 'POST':
            points = request.files['file']
            points.save(secure_filename(points.filename))
            return points

    def calculate_distance(lat1, lon1, lat2, lon2):
        # approximate radius of earth in mm
        radius = 6371.0 * 1000
        dlat = radians(lat2-lat1)
        dlon = radians(lon2-lon1)
        a = sin(dlat/2) * sin(dlat/2) + cos(radians(lat1)) \
            * cos(radians(lat2)) * sin(dlon/2) * sin(dlon/2)
        c = 2 * atan2(sqrt(a), sqrt(1-a))
        d = radius * c
        return d

    points = upload_file()
    with open(points, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        points = []
        links = []
        for row in reader:
            p = {"name": list(row.items())[0][1], "address (La/Lo)": (list(row.items())[1][1], list(row.items())[2][1])}
            points.append(p)

    l = []
    for point in points:
        for v in point.values():
            l.append(v)

    links = []
    for i in range(0, len(l) - 1, 2):
        if (i > 0):
            for j in range(i-2, 0, -2):
                links.append({'name': l[i] + l[j], 'distance': calculate_distance(float(l[i+1][0]), float(l[i+1][1]), float(l[j+1][0]), float(l[j+1][1]))})
        else:
            for j in range(i+2, len(l), 2):
                links.append({'name': l[i] + l[j], 'distance': calculate_distance(float(l[i+1][0]), float(l[i+1][1]), float(l[j+1][0]), float(l[j+1][1]))})

    return jsonify({points : points, links : links})
The keys should be strings:
return jsonify({"points": points, "links": links})
{points : points, links : links} would be the right thing in JavaScript.
Alternatively, you can make a dict with string keys like so:
dict(points=points, links=links)
According to the jsonify docs, you can do this directly:
return jsonify(points=points, links=links)
I have some code that reads an info file, extracts information using Python's regex module, and writes it into a new file. When I test this portion of the code individually in its own script, it works perfectly. However, when I add it to the rest of my code, I get this error:
NameError: name 're' is not defined
Below is my entire code. The regex portion is obvious (all the re.search commands):
import glob
import subprocess
import os
import datetime
import matplotlib.pyplot as plt
import csv
import re
import ntpath

x = open('data.txt', 'w')
m = open('graphing_data.txt', 'w')

ckopuspath = '/Volumes/DAVIS/sfit-ckopus/ckopus'

command_init = 'sfit4Layer0.py -bv5 -fh'
subprocess.call(command_init.split(), shell=False)

with open('/Volumes/DAVIS/calpy_em27_neu/spectra_out_demo/info.txt', 'rt') as infofile:  # the info.txt file created by CALPY
    for count, line in enumerate(infofile):
        with open('\\_spec_final.t15', 'w') as t:
            lat = re.search('Latitude of location:\s*([^;]+)', line, re.IGNORECASE).group(0)
            lat = lat.split()
            lat = lat[3]
            lat = float(lat)

            lon = re.search('Longitude of location:\s*([^;]+)', line, re.IGNORECASE).group(0)
            lon = lon.split()
            lon = lon[3]
            lon = float(lon)

            date = re.search('Time of measurement \(UTC\): ([^;]+)', line).group(0)
            date = date.split()
            yeardate = date[4]
            yeardate = yeardate.split('-')
            year = int(yeardate[0])
            month = int(yeardate[1])
            day = int(yeardate[2])

            time = date[5]
            time = time.split(':')
            hour = int(time[0])
            minute = int(time[1])
            second = float(time[2])

            dur = re.search('Duration of measurement \[s\]: ([^;]+)', line).group(0)
            dur = dur.split()
            dur = float(dur[4])

            numpoints = re.search('Number of values of one scan:\s*([^;]+)', line, re.IGNORECASE).group(0)
            numpoints = numpoints.split()
            numpoints = float(numpoints[6])

            fov = re.search('semi FOV \[rad\] :\s*([^;]+)', line, re.IGNORECASE).group(0)
            fov = fov.split()
            fov = fov[3]
            fov = float(fov[1:])

            sza = re.search('sun Azimuth \[deg\]:\s*([^;]+)', line, re.IGNORECASE).group(0)
            sza = sza.split()
            sza = float(sza[3])

            snr = 0.0000
            roe = 6396.2
            res = 0.5000

            lowwav = re.search('first wavenumber:\s*([^;]+)', line, re.IGNORECASE).group(0)
            lowwav = lowwav.split()
            lowwav = float(lowwav[2])

            highwav = re.search('last wavenumber:\s*([^;]+)', line, re.IGNORECASE).group(0)
            highwav = highwav.split()
            highwav = float(highwav[2])

            spacebw = (highwav - lowwav) / numpoints

            d = datetime.datetime(year, month, day, hour, minute, second)

            t.write('{:>12.5f}{:>12.5f}{:>12.5f}{:>12.5f}{:>8.1f}'.format(sza, roe, lat, lon, snr))  # line 1
            t.write("\n")
            t.write('{:>10d}{:>5d}{:>5d}{:>5d}{:>5d}{:>5d}'.format(year, month, day, hour, minute, second))  # line 2
            t.write("\n")
            t.write(('{:%Y/%m/%d %H:%M:%S}'.format(d)) + "UT Solar Azimuth:" + ('{:>6.3f}'.format(sza)) + " Resolution:" + ('{:>6.4f}'.format(res)) + " Duration:" + ('{:>6.2f}'.format(dur)))  # line 3
            t.write("\n")
            t.write('{:>21.13f}{:>26.13f}{:>24.17e}{:>12f}'.format(lowwav, highwav, spacebw, numpoints))  # line 4
            t.write("\n")

            calpy_path = '/Volumes/DAVIS/calpy_em27_neu/spectra_out_demo/140803/*'  # the CALPY output files!
            files1 = glob.glob(calpy_path)

            with open(files1[count], 'r') as g:
                for line in g:
                    wave_no, intensity = [float(item) for item in line.split()]
                    if lowwav <= wave_no <= highwav:
                        t.write(str(intensity) + '\n')

##########################

subprocess.call(['sfit4Layer0.py', '-bv5', '-fs'], shell=False)  # I think this writes the summary file

# this retrieves info from summary and outputs it into data.txt (for readability)
# and graphing_data.txt (for graphing)
road = '/Volumes/DAVIS/calpy_em27_neu/spectra_out_demo/sfit4_trial'  # path to summary file that is produced - not sure where this is usually*

for infile in glob.glob(os.path.join(road, 'summary*')):
    lines = open(infile, 'r').readlines()
    # extract info from summary
    x.write('{0} {1} {2} {3} {4}'.format(fitrms, chi2, dofsall, dofstrg, iter))
    x.write('\n')

x.close()
m.close()
I have a Python script that runs a PostgreSQL query and stores its output in a CSV file. The script and the file look like this:
import sys, os
os.chdir('C:\Users\Heinz\Desktop')
print os.getcwd()

#set up psycopg2 environment
import psycopg2

#driving_distance module
query = """
    select *
    from driving_distance ($$
        select
            gid as id,
            source::int4 as source,
            target::int4 as target,
            cost::double precision as cost,
            rcost::double precision as reverse_cost
        from network
    $$, %s, %s, %s, %s
    )
"""

#make connection between python and postgresql
conn = psycopg2.connect("dbname = 'TC_area' user = 'postgres' host = 'localhost' password = 'xxxx'")
cur = conn.cursor()

#count rows in the table
cur.execute("select count(*) from network")
result = cur.fetchone()
k = result[0] + 1   #number of points = number of segments + 1

#run loops
rs = []
i = 1
while i <= k:
    cur.execute(query, (i, 100000000000, False, True))
    rs.append(cur.fetchall())
    i = i + 1

#import csv module
import csv
import tempfile
import shutil

j = 0
h = 0
ars = []
element = list(rs)

#export data to every row
filename = 'distMatrix.csv'
with open(filename, 'wb') as f:
    writer = csv.writer(f, delimiter = ',')
    while j <= k - 1:
        while h <= k - 1:
            rp = element[j][h][2]
            ars.append(rp)
            h = h + 1
        else:
            h = 0
        writer.writerow(ars)
        ars = []
        j = j + 1

#concerning about flow-connection
with open(filename, 'rb') as f, tempfile.NamedTemporaryFile(mode='wb', delete=False) as g:
    writer = csv.writer(g, delimiter = ',')
    for row in csv.reader(f):
        row = [element if float(element) < 10**6 else 0 for element in row]
        writer.writerow(row)

shutil.move(g.name, filename)

conn.close()
The numbers in the CSV file are paths calculated by PostgreSQL, and I know they are all composed of the following numbers; let's call them generators:
0, 1, 0.844, 0.69, 0.567, 0.387, 0.156, 0.31, 0.433, 0.613
I want to write some code that checks these two conditions and then edits every field in this CSV file:
if a number in the CSV file is the same as one of the generators, it stays as its original value
if a number in the CSV file is not one of the generators, the code should work out which generators it is composed of, for example 2 = 1 + 1, and then change the addition to multiplication; for that example, the number is replaced by 1 * 1
I think this additional code should be implemented in this part of the script:
#export data to every row
filename = 'distMatrix.csv'
with open(filename, 'wb') as f:
    writer = csv.writer(f, delimiter = ',')
    while j <= k - 1:
        while h <= k - 1:
            rp = element[j][h][2]
            ars.append(rp)
            h = h + 1
        else:
            h = 0
        writer.writerow(ars)
        ars = []
        j = j + 1
But how do I do this task? Please give me some suggestions and hints, thank you.
I am using python 2.7.4 under Windows 8.1 x64.
The second part of your requirement is somewhat confusing, but it sounds to me like you need a generator function to provide values on demand from a list and a way to test whether a number is in a list...
list = [0, 1, 0.844, 0.69, 0.567, 0.387, 0.156, 0.31, 0.433, 0.613]

def gen():
    for i in range(len(list)):
        yield list[i]

g = gen()

def test_the_number(nbr):
    if nbr - int(nbr) in list:
        print("Number in list")
    else:
        print(next(g))

nbr = 5       # not in list
test_the_number(nbr)
nbr = 777     # also not in the list
test_the_number(nbr)
nbr = 0.844   # in the list
test_the_number(nbr)
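The decomposition part of the question can also be sketched directly. The following assumes each non-generator value is the sum of exactly two generators (as in the 2 = 1 + 1 example) and uses a small tolerance for float comparison; the helper names are illustrative, not from the question:

from itertools import combinations_with_replacement

GENERATORS = [0, 1, 0.844, 0.69, 0.567, 0.387, 0.156, 0.31, 0.433, 0.613]
TOL = 1e-6

def is_generator(value):
    return any(abs(value - g) < TOL for g in GENERATORS)

def rewrite_value(value):
    # leave generators untouched; otherwise try to express the value as a
    # sum of two generators and return their product instead
    if is_generator(value):
        return value
    for a, b in combinations_with_replacement(GENERATORS, 2):
        if abs((a + b) - value) < TOL:
            return a * b
    return value  # no two-generator decomposition found; keep as-is

print(rewrite_value(2))      # 1 (since 2 = 1 + 1 becomes 1 * 1)
print(rewrite_value(0.844))  # 0.844 (already a generator)

Each row in the rewriting loop could then be transformed with something like row = [rewrite_value(float(x)) for x in row] before writer.writerow(row).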
I'm new at programming and I've got two CSV files that I'm trying to compare. The first file, snp.csv is shown below:
chrom  position   ref  var  gene    var
1      21421      G    T    WASH7P  snp.LOH
1      1251593    T    C    CPSF3L  snp.somatic
6      107474777  -    A    PDSS2   indel.somatic
14     106586168  G    T    ADAM6   snp.LOH
The second file, quad.csv is shown below:
chrom  Start  End    Sequence
1      21420  21437  GGGACGGGGAGGGTTGGG
1      23058  23078  GGGCTGGGGCGGGGGGAGGG
1      23515  23534  GGGAAGGGACAGGGCAGGG
1      45098  45118  GGGAAAGGGCAGGGCCCGGG
3      1148   1173   GGGCCGGGCAAGGCCGGGTGCAGGG
I want to compare these two files, and where the two chrom values match, print only the rows whose position value (in the snp.csv file) falls within the Start and End values (in the quad.csv file).
So I am looking for a solution that will give me something like the following (basically the snp.csv file with the Start, End and Sequence values of the quad.csv file):
chrom  position  ref  var  gene    var      Start  End    Sequence
1      21421     G    T    WASH7P  snp.LOH  21420  21437  GGGACGGGGAGGGTTGGG
I've searched the posts and found some interesting answers that helped me a lot, but I'm still experiencing some issues. I'm still learning Python…
Here is my script so far; I know I have a problem with the range function... I'm stuck:
import csv

snp_file = open("snp.csv", "r")
quad_file = open("quad.csv", "r")
out_file = open("results.csv", "wb")

snp = csv.reader(snp_file, delimiter='\t')
quad = csv.reader(quad_file, delimiter='\t')
out = csv.reader(out_file, delimiter='\t')

quadlist = [row for row in quad]

for snp_row in snp:
    row = 1
    found = False
    for quad_row in quadlist:
        results_row = snp_row
        if snp_row[0] == quad_row[0]:
            quad_pos = range(quad_row[1], quad_row[2])
            if snp_row[1] in quad_pos:
                results_row.append(quad_row)
                found = True
                break
        row = row + 1
    if not found:
        pass
    print(results_row)

snp.close()
quad.close()
out.close()
from bisect import bisect_right
from collections import defaultdict
import csv

TOO_HIGH = 2147483647  # higher than any actual gene position

SNP_FMT = "{0:<7} {1:<11} {2:3} {3:3} {4:11} {5:15}".format
QUAD_FMT = " {1:<7} {2:<7} {3}".format

def line_to_quad(line):
    row = line.split()
    return int(row[0]), int(row[1]), int(row[2]), row[3]

def line_to_snp(line):
    row = line.split()
    return int(row[0]), int(row[1]), row[2], row[3], row[4], row[5]

class Quads:
    @classmethod
    def from_file(cls, fname):
        with open(fname, "rU") as inf:
            next(inf, None)  # skip header line
            quads = (line_to_quad(line) for line in inf)
            return cls(quads)

    def __init__(self, rows):
        self.chromosomes = defaultdict(list)
        for row in rows:
            self.chromosomes[row[0]].append(row[1:])
        for segs in self.chromosomes.values():
            segs.sort()

    def find_match(self, chromosome, position):
        segs = self.chromosomes[chromosome]
        index = bisect_right(segs, (position, TOO_HIGH, "")) - 1
        try:
            seg = segs[index]
            if seg[0] <= position <= seg[1]:
                return (chromosome,) + seg
        except IndexError:
            pass

def main():
    quads = Quads.from_file("quad.csv")
    print(  # header
        SNP_FMT("chrom", "position", "ref", "var", "gene", "var") +
        QUAD_FMT("chrom", "Start", "End", "Sequence")
    )
    with open("snp.csv") as inf:
        next(inf, None)  # skip header line
        for line in inf:
            snp = line_to_snp(line)
            quad = quads.find_match(snp[0], snp[1])
            if quad:
                print(SNP_FMT(*snp) + QUAD_FMT(*quad))

if __name__ == "__main__":
    main()
which gives
chrom   position    ref var gene        var             Start   End     Sequence
1       21421       G   T   WASH7P      snp.LOH         21420   21437   GGGACGGGGAGGGTTGGG
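If the combined rows should also end up in results.csv (as the out_file in the question suggests) rather than only being printed, one possible follow-up, not part of the answer above, is to collect the (snp, quad) matches and write them with csv.writer; write_results is an illustrative name:

import csv

def write_results(matches, fname="results.csv"):
    # matches is assumed to be a list of (snp_tuple, quad_tuple) pairs,
    # e.g. built from line_to_snp() and Quads.find_match() above
    with open(fname, "w", newline="") as out:
        writer = csv.writer(out, delimiter="\t")
        writer.writerow(["chrom", "position", "ref", "var", "gene", "var",
                         "Start", "End", "Sequence"])
        for snp, quad in matches:
            writer.writerow(list(snp) + list(quad[1:]))  # drop the repeated chrom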