Python CSV judge numbers' composition and replace it - python

I have a python script to run PostgreSQL and store its output in a CSV file. The script and the file looks like,
import sys, os
os.chdir('C:\Users\Heinz\Desktop')
print os.getcwd()
#set up psycopg2 environment
import psycopg2
#driving_distance module
query = """
select *
from driving_distance ($$
select
gid as id,
source::int4 as source,
target::int4 as target,
cost::double precision as cost,
rcost::double precision as reverse_cost
from network
$$, %s, %s, %s, %s
)
"""
#make connection between python and postgresql
conn = psycopg2.connect("dbname = 'TC_area' user = 'postgres' host = 'localhost' password = 'xxxx'")
cur = conn.cursor()
#count rows in the table
cur.execute("select count(*) from network")
result = cur.fetchone()
k = result[0] + 1 #number of points = number of segments + 1
#run loops
rs = []
i = 1
while i <= k:
cur.execute(query, (i, 100000000000, False, True))
rs.append(cur.fetchall())
i = i + 1
#import csv module
import csv
import tempfile
import shutil
j = 0
h = 0
ars = []
element = list(rs)
#export data to every row
filename = 'distMatrix.csv'
with open(filename, 'wb') as f:
writer = csv.writer(f, delimiter = ',')
while j <= k - 1:
while h <= k - 1:
rp = element[j][h][2]
ars.append(rp)
h = h + 1
else:
h = 0
writer.writerow(ars)
ars = []
j = j + 1
#concerning about flow-connection
with open(filename, 'rb') as f, tempfile.NamedTemporaryFile(mode='wb', delete=False) as g:
writer = csv.writer(g, delimiter = ',')
for row in csv.reader(f):
row = [element if float(element) < 10**6 else 0 for element in row]
writer.writerow(row)
shutil.move(g.name, filename)
conn.close()
The numbers in the CSV file are paths calculated by PostgreSQL, and I know they are all composed by the following numbers, let's call them generator
0, 1, 0.844, 0.69, 0.567, 0.387, 0.156, 0.31, 0.433, 0.613
I want to write some codes that can judge these 2 conditions, and then edit every field in this CSV file,
if numbers in the CSV file are just the same as one of the generator, then they stay the same as their original number
if numbers in the CSV file are not one of the generator, then the code can judge this number composed by what generator, for example, 2 = 1 + 1, and then change the addition to multiplication, for the last example, replace this number by 1 * 1
I think these additional codes should be implemented in this part of the script,
#export data to every row
filename = 'distMatrix.csv'
with open(filename, 'wb') as f:
writer = csv.writer(f, delimiter = ',')
while j <= k - 1:
while h <= k - 1:
rp = element[j][h][2]
ars.append(rp)
h = h + 1
else:
h = 0
writer.writerow(ars)
ars = []
j = j + 1
But how to do this task? Please give me some suggestions and hints, thank you.
I am using python 2.7.4 under Windows 8.1 x64.

The second part of your requirement is somewhat confusing. But it sounds like to me you need a generator function to provide values on demand from a list and a way to test if the number is in a list...
list = [ 0, 1, 0.844, 0.69, 0.567, 0.387, 0.156, 0.31, 0.433, 0.613 ]
def gen():
for i in range(len(list)):
yield list[i]
g = gen()
def test_the_number(nbr):
if nbr-int(nbr) in list:
print("Number in list")
else:
print(next(g))
nbr = 5 # not in list
test_the_number(nbr)
nbr =777 # also not in the list
test_the_number(nbr)
nbr = 0.844 # In the list
test_the_number(nbr)

Related

How do you replace segments in a line using fileinput

I am creating a program for counting coins and I want to create a mechanism which essentially scans a specifically written text file and is able to calculate whether it has been falsely counted but also will replace the ending segment of the line with either Y for Yes or N for No.
The txt file reads as such:
Abena,5p,325.00,Y
Malcolm,1p,3356.00,Y
Jane,£2,120.00,Y
Andy,£1,166.25,N
Sandip,50p,160.00,Y
Liz,20p,250.00,Y
Andy,20p,250.00,Y
Andy,50p,160.00,Y
Jane,£1,183.75,N
Liz,£,179.0,N
Liz,50p,170.0,N
Jane,50p,160.0,Y
Sandip,£1,183.0,N
Jane,£2,132.0,N
Abena,1p,3356.0,N
Andy,2p,250.0,N
Abena,£1,175.0,Y
Malcolm,50p,160.0,Y
Malcolm,£2,175.0,N
Malcolm,£1,175.0,Y
Malcolm,1p,356.0,Y
Liz,20p,250.0,Y
Jane,£2,120.0,Y
Jane,50p,160.0,Y
Andy,£1,175.0,Y
Abena,1p,359.56,N
Andy,5p,328.5,N
Andy,£2,108.0,N
Malcolm,£2,12.0,N
as you can see every line is split into 4 segments, I want the fileinput to be able to replace the fourth segment within the specified line.
My program (all the relevant things to see right now) is as follows:
class Volunteer:
def __init__(self, name, coin_type, weight_of_bag, true_count):
self.name = name
self.coin_type = coin_type # a function allowing me to class the data
self.weight_of_bag = weight_of_bag
self.true_count = true_count
just a simple object system to make things easier for later
with open("CoinCount.txt", "r", encoding="'utf-8'") as csvfile:
volunteers = []
for line in csvfile:
volunteers.append(Volunteer(*line.strip().split(',')))
just to create a list as well as an object for easier calculations
def runscan():
with open("CoinCount.txt", "r+", encoding='utf-8') as csvfile:
num_lines = 0
for line in csvfile:
num_lines = num_lines + 1
i = 0
while i < num_lines:
ct = (volunteers[i].coin_type)
wob = float(volunteers[i].weight_of_bag)
if ct == ("£2" or "2"):
accurate_weight = float(12.0)
limit = 10
bag_value = 10 * 12
elif ct == ("£1" or "1"):
accurate_weight = float(8.75)
limit = 20
bag_value = 20 * 8.75
elif ct == "50p":
accurate_weight = float(8)
limit = 20
bag_value = 20 * 8
elif ct == "20p":
accurate_weight = float(5)
limit = 50
bag_value = 5 * 50
elif ct == "10p":
accurate_weight = float(6.5)
limit = 50
bag_value = 6.5 * 50
elif ct == "5p":
accurate_weight = float(3.25)
limit = 100
bag_value = 3.25 * 100
elif ct == "2p":
accurate_weight = float(7.12)
limit = 50
bag_value = 50 * 7.12
elif ct == "1p":
accurate_weight = float(3.56)
limit = 100
bag_value = 3.56 * 100
number_of_bags = wob / bag_value
print("Number of bags on this is" + str(number_of_bags))
import fileinput
line = line[i]
if number_of_bags.is_integer():
with fileinput.FileInput('CoinCount.txt',inplace=True) as fileobj:
for line in fileobj:
x = line.split(',')
for w, word in enumerate(x):
if w == 3 and word == 'N':
print(line[i].replace('N', 'Y'), end='')
i = i + 1
else:
i = i + 1
else:
with fileinput.FileInput('CoinCount.txt',inplace=True) as fileobj:
for line in fileobj:
x = line.split(',')
for w, word in enumerate(x):
if w == 3 and word == 'Y':
print(line[i].replace('Y', 'N'), end='')
i = i + 1
else:
i = i + 1
and finally the thing Im having issues with, the scan function.
the issue is specifically within the last few lines of code here (the replacement part):
import fileinput
if number_of_bags.is_integer():
target, replacement = ('N', 'Y')
else:
target, replacement = ('Y', 'N')
with fileinput.FileInput('CoinCount.txt', inplace=True) as fileobj:
for i, line in enumerate(fileobj):
words = line.rstrip().split(',')
if line.words[3] == target:
line.words[3] = replacement
print(','.join(words))
i = i + 1
f = fileobj.lineno() # Number of lines processed.
print(f'Done, {f} lines processed')
I basically have created a function which goes down each line and calculates the next line down until there aren't anymore lines, the issue with the last part is that I am unable to replace the actual txt file, If I were to run this program right now the result would be a completely blank page. I know that the fix is most likely a simple but tricky discovery but It is really bothering me as this is all that is needed for my program to be complete.
I understand the majority of the coding used but I am very new to fileinput, I want to be able to go from each line and replace the final segment if the segments name (i.e "Y" or "N") given is inaccurate to the actual legitimate segment name as Y is for true and N is for false. Please help, I tried to make sure this question was as easily understandable as possible, please make your example relatable to my program
As far as I understood, the problem is whether the calculation of the weight is correct or not. So just create another file instead of using fileinput. Do you really need it ?
test.csv
Abena,5p,325.00,Y
Malcolm,1p,3356.00,Y
Read the csv and assign some header names
Remove the last column, we don't care if it's correct or not, we will calculate the result anyways
Gather your calculation function in one method, we will apply this to every row
Apply function to every row, if it's correct write "Y" else write "N"
Truncate the whole file and write it over
import pandas as pd
with open("test.csv", "r+") as f:
df = pd.read_csv(f, names=["name", "coin", "weight", "res"])
del df["res"]
def calculate(row):
if row["coin"] == "5p":
return "Y" if 3.25 * 100 == row["weight"] else "N"
elif row["coin"] == "1p":
return "Y" if 3.56 * 100 == row["weight"] else "N"
df["res"] = df.apply(lambda row: calculate(row), axis=1)
f.seek(0)
f.truncate()
df.to_csv(f, index=False, header=False)
test.csv
Abena,5p,325.0,Y
Malcolm,1p,3356.0,N

Python Pandas How to save output to csv

Hello now im working on my project. I want to get candidate of text block by using algorithm below.
My input is a csv document which contain :
HTML column : the html code in a line
TAG column : the tag of html code in a line
Words : the text inside the tag in aline
TC : the number of words in a line
LTC : the number of anchor words in a line
TG : the number of tag in a line
P : the number of tag p and br in a line
CTTD : TC + (0.2*LTC) + TG - P
CTTDs : the smoothed CTTD
This is my algorithm to find candidate of text block. I make the csv file into dataframe using pandas. I am using CTTDs,TC and TG column to find the candidate.
from ListSmoothing import get_filepaths_smoothing
import pandas as pd
import numpy as np
import csv
filenames = get_filepaths_smoothing(r"C:\Users\kimhyesung\PycharmProjects\newsextraction\smoothing")
index = 0
for f in filenames:
file_html=open(str(f),"r")
df = pd.read_csv(file_html)
#df = pd.read_csv('smoothing/Smoothing001.csv')
news = np.array(df['CTTDs'])
new = np.array(df['TG'])
minval = np.min(news[np.nonzero(news)])
maxval = np.max(news[np.nonzero(news)])
j = 0.2
thetaCTTD = minval + j * (maxval-minval)
#maxGap = np.max(new[np.nonzero(new)])
#minGap = np.min(new[np.nonzero(new)])
thetaGap = np.min(new[np.nonzero(new)])
#print thetaCTTD
#print maxval
#print minval
#print thetaGap
def create_candidates(df, thetaCTTD, thetaGAP):
k = 0
TB = {}
TC = 0
for index in range(0, len(df) - 1):
start = index
if df.ix[index]['CTTDs'] > thetaCTTD:
start = index
gap = 0
TC = df.ix[index]['TC']
for index in range(index + 1, len(df) - 1):
if df.ix[index]['TG'] == 0:
continue
elif df.ix[index]['CTTDs'] <= thetaCTTD and gap >= thetaGAP:
break
elif df.ix[index]['CTTDs'] <= thetaCTTD:
gap += 1
TC += df.ix[index]['TC']
if (TC < 1) or (start == index):
continue
TB.update({
k: {
'start': start,
'end': index - 1
}
})
k += 1
return TB
def get_unique_candidate(TB):
TB = tb.copy()
for key, value in tb.iteritems():
if key == len(tb) - 1:
break
if value['end'] == tb[key+1]['end']:
del TB[key+1]
elif value['start'] < tb[key+1]['start'] < value['end']:
TB[key]['end'] = tb[key+1]['start'] - 1
else:
continue
return TB
index += 1
stored_file = "textcandidate/textcandidate" + '{0:03}'.format(index) + ".csv"
tb = create_candidates(df, thetaCTTD, thetaGap)
TB = get_unique_candidate(tb)
filewrite = open(stored_file, "wb")
df_list = []
for (k, d) in TB.iteritems():
candidate_df = df.loc[d['start']:d['end']]
candidate_df['candidate'] = k
df_list.append(candidate_df)
output_df = pd.concat(df_list)
output_df.to_csv(stored_file)
writer = csv.writer(filewrite, lineterminator='\n')
filewrite.close
ThetaCTTD is 10.36 and thethaGap is 1.
The output is
The output means there are 2 candidates of text block . First the candiate of text block start from line number 215 and end line number 225 (like the pict bellow). And the other candidate of text block start from line number 500 and end line number 501.
My question is how to save the output into csv and not only the number of line but the range of the text block and the others column will appear as the output too?
My expected output is like the screenshot of candidate text block is like this one
Assuming your output is a list of dictionaries:
pd.concat([df.loc[d['start']:d['end']] for (k, d) in TB.iteritems()])
Note that we slice by label, so d['end'] will be included.
Edit: add the candidate number in a new column.
It's cleaner to write a loop than to do two concat operations:
df_list = []
for (k, d) in TB.iteritems():
candidate_df = df.loc[d['start']:d['end']]
candidate_df['candidate'] = k
df_list.append(candidate_df)
output_df = pd.concat(df_list)
It's also faster to concatenate all dataframes at once at the end.

Loop through csv rows and check for a specific value

Hello I got a question regarding loops. The situation now is that I got a csv file where I check whether in column3 (row[2]) the value "1" is present. If not just skip it and loop again with add up value:
i = 1
maxuserid = 7255
result_liked = []
with open('source/to/file/user_id%i.csv' %i,'r') as fin:
for row in csv.reader(fin, delimiter='\t'):
if int(row[2]) >= 1:
result_liked.append(row)
i += 1
else:
i += 1
#more code
The thing is that I need a for loop that runs all the code and after the run is completed add the value "1" up to my i variable.
The goal of my code is to run the whole code and after it is done I want to add up the value i from 1 to 2 and run the loop again, untill the maxuserid of 7255 is reached. How can I get a loop that does this from 1 till 7255?
EDIT:
import csv
maxuserid = 7255
result_liked = []
for i in range(maxuserid):
with open('source/to/file/user_id%i.csv' %(i+1),'r') as fin:
for row in csv.reader(fin, delimiter='\t'):
if int(row[2]) >= 1:
result_liked.append(row)
training_data = result_liked[:2]
test_data = result_liked[2:]
training_data_bookid = [el[1] for el in training_data]
test_data_bookid = [el[1] for el in test_data]
#training_data_bookid_int = map(int, training_data_bookid) #python2
training_data_bookid_int = list(map(int, training_data_bookid)) #python3
test_data_bookid_int = list(map(int, test_data_bookid)) #python3
books_list = []
for j in range(0,2):
with open('source/to/file/output_new.csv', 'rt') as f:
reader = csv.reader(f, delimiter=',', quotechar='"')
for row in reader:
get_book_id = training_data_bookid_int[j]
if get_book_id == int(row[0]):
books_list.append([row[2],row[1]])
b = sorted(books_list, reverse=True, key=lambda x:int(x[0]))
c = [el[1] for el in b]
c_int = list(map(int, c))
check_training_vs_test = set(c_int) & set(test_data_bookid_int)
with open("result.txt", "a") as text_file:
text_file.write("Userid: %i || Liked: %s || Test: %f" % (i, len(test_data), len(check_training_vs_test)))
Try following code
maxuserid = 7255
result_liked = []
for i in range(maxuserid): # this loop iterates through all users files
with open('source/to/file/user_id%d.csv' % (i+1),'r') as fin:
for row in csv.reader(fin, delimiter='\t'):
if int(row[2]) >= 1:
result_liked.append(row)
Update
I think you need something like:
maxuserid = 7255
for i in range(maxuserid):
result_liked = [] # form a separate list for each csv file
with open('source/to/file/user_id%i.csv' %(i+1),'r') as fin:
for row in csv.reader(fin, delimiter='\t'):
if int(row[2]) >= 1:
result_liked.append(row)
if len(result_liked) < 3: # if list too few elements just go to next file
continue
training_data = result_liked[:2]
test_data = result_liked[2:]
...

Cant make a txt output at proper format

This is my code. The problem is that the output looks like this
2015-06-03 19:32:11.225085
{'2015-01-21-20:56:45.mp3': 1}{'negative': -2}{'2015-01-15-21:28:23.mp3': 1}
i want to be like a dictionary.... like this one below so i can read it back as a dictionary and remove the keys from the first subset sum and go on output a second on and so on until no other subset sum exists...
2015-06-03 19:32:11.225085
{'2015-01-21-20:56:45.mp3': 1, 'negative': -2, '2015-01-15-21:28:23.mp3': 1}
Any ideas?
thanx in advanced
import os, sys,re,gzip, pickle
from itertools import combinations
import json
from datetime import datetime
mp3folder = raw_input('Please copy paste the mp3s path:')
lowerin = input('Please enter your total playlist time in NEGATIVE seconds and hit ENTER:')
r = {}
drk = os.listdir(mp3folder)
drifiles = list(drk)
r = dict.fromkeys(drifiles, 0)
for key in r.keys():
print ('Please enter the duration of...')
print(key)
r[key] = input('in seconds and hit ENTER:')
r['negative'] = lowerin
d = {}
neg = 0
pos = 0
dates = datetime.now()
dates = str(dates)
f = open("dict.txt",'ab')
f.write('\n'+dates+'\n')
f.close()
for (w,v) in r.iteritems():
if v > 0: pos += v
else: neg += v
sums = [0] * (pos - neg + 1)
for (w,v) in r.iteritems():
s = sums[:]
if not s[v - neg]: s[v - neg] = (w,)
for (i, w2) in enumerate(sums):
if w2 and not s[i + v]:
s[i + v] = w2 + (w,)
sums = s
if s[-neg]:
for x in s[-neg]:
d = dict([(x, r[x])])
file('dict.txt','a'+'\n').write(repr(d))
break
f = open('dict.txt','r')
filedata = f.read()
f.close()
newdata = filedata.replace("}{",", ")
f = open('lexiko.txt','w')
f.write(newdata)
f.close()
di = eval(open("lexiko.txt").read())
print di
this will do it

Printing values following comparison of two csv files only if in a specific range using Python 3.3

I'm new at programming and I've got two CSV files that I'm trying to compare. The first file, snp.csv is shown below:
chrom position ref var gene var
1 21421 G T WASH7P snp.LOH
1 1251593 T C CPSF3L snp.somatic
6 107474777 - A PDSS2 indel.somatic
14 106586168 G T ADAM6 snp.LOH
The second file, quad.csv is shown below:
chrom Start End Sequence
1 21420 21437 GGGACGGGGAGGGTTGGG
1 23058 23078 GGGCTGGGGCGGGGGGAGGG
1 23515 23534 GGGAAGGGACAGGGCAGGG
1 45098 45118 GGGAAAGGGCAGGGCCCGGG
3 1148 1173 GGGCCGGGCAAGGCCGGGTGCAGGG
I want to compare these two files and if the two chrom values match, I want to print only those having position value (in snp.csv file) in the range of the start and end value (in the quad.csv file).
So, I am looking for a solution that will give me something like the following (basically the snp.csv file with start, end and sequence value of the quad.csv file)
chrom position ref var gene var Start End Sequence
1 21421 G T WASH7P snp.LOH 21420 21437 GGGACGGGGAGGGTTGGG
I've searched the posts and found some interesting answers that helped me a lot but I’m still experiencing some issues. I’m still learning Python…
Here is my script up to now, I know I have a problem with the range function...I'm stuck
import csv
snp_file = open("snp.csv", "r")
quad_file = open("quad.csv", "r")
out_file = open("results.csv", "wb")
snp = csv.reader(snp_file, delimiter='\t')
quad = csv.reader(quad_file, delimiter='\t')
out = csv.reader(out_file, delimiter='\t')
quadlist = [row for row in quad]
for snp_row in snp:
row = 1
found = False
for quad_row in quadlist:
results_row = snp_row
if snp_row[0] == quad_row[0]:
quad_pos = range(quad_row[1], quad_row[2])
if snp_row[1] in quad_pos:
results_row.append(quad_row)
found = True
break
row = row + 1
if not found:
pass
print (results_row)
snp.close()
quad.close()
out.close()
from bisect import bisect_right
from collections import defaultdict
import csv
TOO_HIGH = 2147483647 # higher than any actual gene position
SNP_FMT = "{0:<7} {1:<11} {2:3} {3:3} {4:11} {5:15}".format
QUAD_FMT = " {1:<7} {2:<7} {3}".format
def line_to_quad(line):
row = line.split()
return int(row[0]), int(row[1]), int(row[2]), row[3]
def line_to_snp(line):
row = line.split()
return int(row[0]), int(row[1]), row[2], row[3], row[4], row[5]
class Quads:
#classmethod
def from_file(cls, fname):
with open(fname, "rU") as inf:
next(inf, None) # skip header line
quads = (line_to_quad(line) for line in inf)
return cls(quads)
def __init__(self, rows):
self.chromosomes = defaultdict(list)
for row in rows:
self.chromosomes[row[0]].append(row[1:])
for segs in self.chromosomes.values():
segs.sort()
def find_match(self, chromosome, position):
segs = self.chromosomes[chromosome]
index = bisect_right(segs, (position, TOO_HIGH, "")) - 1
try:
seg = segs[index]
if seg[0] <= position <= seg[1]:
return (chromosome,) + seg
except IndexError:
pass
def main():
quads = Quads.from_file("quad.csv")
print( # header
SNP_FMT("chrom", "position", "ref", "var", "gene", "var") +
QUAD_FMT("chrom", "Start", "End", "Sequence")
)
with open("snp.csv") as inf:
next(inf, None) # skip header line
for line in inf:
snp = line_to_snp(line)
quad = quads.find_match(snp[0], snp[1])
if quad:
print(SNP_FMT(*snp) + QUAD_FMT(*quad))
if __name__=="__main__":
main()
which gives
chrom position ref var gene var Start End Sequence
1 21421 G T WASH7P snp.LOH 21420 21437 GGGACGGGGAGGGTTGGG

Categories

Resources