Loop through csv rows and check for a specific value - python

Hello, I have a question regarding loops. I have a csv file in which I check whether column 3 (row[2]) contains the value "1". If it does not, the row should simply be skipped and the loop should continue with an incremented value:
i = 1
maxuserid = 7255
result_liked = []
with open('source/to/file/user_id%i.csv' %i,'r') as fin:
for row in csv.reader(fin, delimiter='\t'):
if int(row[2]) >= 1:
result_liked.append(row)
i += 1
else:
i += 1
#more code
What I need is a for loop that runs all of the code and, once a run is complete, adds 1 to my i variable.
The goal is to run the whole block of code, then increase i (from 1 to 2, and so on) and run it again, until the maxuserid of 7255 is reached. How can I write a loop that does this for i from 1 to 7255?
EDIT:
import csv
maxuserid = 7255
result_liked = []
for i in range(maxuserid):
with open('source/to/file/user_id%i.csv' %(i+1),'r') as fin:
for row in csv.reader(fin, delimiter='\t'):
if int(row[2]) >= 1:
result_liked.append(row)
training_data = result_liked[:2]
test_data = result_liked[2:]
training_data_bookid = [el[1] for el in training_data]
test_data_bookid = [el[1] for el in test_data]
#training_data_bookid_int = map(int, training_data_bookid) #python2
training_data_bookid_int = list(map(int, training_data_bookid)) #python3
test_data_bookid_int = list(map(int, test_data_bookid)) #python3
books_list = []
for j in range(0,2):
with open('source/to/file/output_new.csv', 'rt') as f:
reader = csv.reader(f, delimiter=',', quotechar='"')
for row in reader:
get_book_id = training_data_bookid_int[j]
if get_book_id == int(row[0]):
books_list.append([row[2],row[1]])
b = sorted(books_list, reverse=True, key=lambda x:int(x[0]))
c = [el[1] for el in b]
c_int = list(map(int, c))
check_training_vs_test = set(c_int) & set(test_data_bookid_int)
with open("result.txt", "a") as text_file:
text_file.write("Userid: %i || Liked: %s || Test: %f" % (i, len(test_data), len(check_training_vs_test)))

Try following code
maxuserid = 7255
result_liked = []
for i in range(maxuserid): # this loop iterates through all users files
with open('source/to/file/user_id%d.csv' % (i+1),'r') as fin:
for row in csv.reader(fin, delimiter='\t'):
if int(row[2]) >= 1:
result_liked.append(row)
Update
I think you need something like:
maxuserid = 7255
for i in range(maxuserid):
result_liked = [] # form a separate list for each csv file
with open('source/to/file/user_id%i.csv' %(i+1),'r') as fin:
for row in csv.reader(fin, delimiter='\t'):
if int(row[2]) >= 1:
result_liked.append(row)
if len(result_liked) < 3: # if list too few elements just go to next file
continue
training_data = result_liked[:2]
test_data = result_liked[2:]
...

Related

Lists in dictionary

I have a text file of format like this
10:45 a b c
x 0 1 2
y 4 5 6
z 7 8 9
I want to make x the key and 0, 1, 2 its values in a list. The same goes for y and z.
while(1):
line = f.readline()
if time in line:
print (line)
L1.append(line)
for count,line in enumerate(f):
if (i < 3):
L1.append(line)
print ("Line{} : {}".format(count,line.strip()))
i=i+1
#print(L1)
for k in range(1):
print(L1[k])
test1 = L1[k]
a1 = test1.split()
print (a1[1])
dict = {a1[1]: L1[k] for a1[1] in L1[k]}
print (dict)
for k in range(1,3):
#print("hey")
print (L1[k]) #will list the single row
test = L1[k]
#print(test)
a = test.split()
print (a[0])
dict = {a[0]:L1[k] for a[0] in L1[k]}
print (dict)
Any idea what I am doing wrong here?
P.S. - I am new to python
You could try this:
my_dict = {}
# File objects expose readlines() (all lowercase); slicing off the first
# entry drops the "10:45 a b c" header line and tolerates an empty file.
lines = f.readlines()[1:]
for line in lines:
    # split() with no argument discards the trailing newline and collapses
    # repeated spaces, so the last value does not end in "\n".
    line_list = line.split()
    if not line_list:
        continue  # ignore blank lines
    key = line_list.pop(0)
    my_dict[key] = line_list
This should do what I think you need (assuming your text file is stored in the same directory; replace 'test.txt' with your filename):
with open('test.txt', 'r') as values:
contents = values.read().strip().split()
new_dict = {}
i = 4
while i <= len(contents)-4:
new_dict.update({contents[i]: contents[i+1:i+4]})
i += 4
print(new_dict)
or this, if you want the values as integers:
with open('test.txt', 'r') as values:
contents = values.read().strip().split()
new_dict = {}
i = 4
while i <= len(contents)-4:
new_dict.update({contents[i]: [int(contents[i+1]),int(contents[i+2]),int(contents[i+3])]})
i += 4
print(new_dict)
Try this
import string
start_found = False
result_dict = {}
for line in open("stack1_input.txt", mode='r'):
if (line.startswith("10:45")):
start_found = True
continue
if start_found == True:
values = line.split()
if len(values) == 4:
result_dict[values[0]] = values[1:]
print (result_dict)

Writing data to files after few iteration in python

I want to write data into five files on every fifth iteration. Is there any way to do that? I am confused about how to fetch the past data.
i=1
while 1:
data = random.randint(0,100)
print(data)
if(i%5==0):
with open('D:\mydata\my%d.csv'%(i-4),'D:\mydata\my%d.csv'%(i-3), "w") as csv_file:
writer = csv.writer(csv_file, delimiter=',')
level_counter = 0
max_levels = 1
while level_counter < max_levels:
filename1 = data
writer.writerow(("No load", filename1))
level_counter = level_counter +1
print("done")
i=i+1
time.sleep(2)
Just use a list to store data from the past 5 iterations:
i = 1
past_data = []
while True:
data = random.randint(0, 100)
past_data.append(data)
if i % 5 == 0:
...
past_data = []
i += 1

How to re-write a while statement as an if statement in Python 2.7?

I wrote a script that searches an excel document for 'X', and when it finds an 'X' it copies the first column and first row associated with the 'X' into a CSV file.
I've been told that there's a better way to do this with 'if' statements. Not quite sure how.
Here's the code:
import xlrd
import csv
###Grab the data from sheet 1
def get_row_values(workSheet, row):
    """Return the values of every cell in one row of a worksheet.

    Args:
        workSheet: Sheet-like object exposing ``ncols`` and
            ``cell_value(row, col)``.
        row: Index of the row to read.

    Returns:
        A list holding the value of each cell in ``row``, in column order.
        The list is empty when the sheet has no columns.
    """
    # Read from the sheet that was passed in rather than a module-level
    # global, so the helper works for whichever sheet the caller supplies.
    return [workSheet.cell_value(row, col) for col in range(workSheet.ncols)]
file_path = 'foo.xlsx'
output = []
#Write the data
myWorkbook = xlrd.open_workbook(file_path)
myWorksheet = myWorkbook.sheet_by_name('foosheet')
num_rows = myWorksheet.nrows - 1
curr_row = 0
column_names = get_row_values(myWorksheet, curr_row)
#print("TOTAL ENTRIES:")
#print len(column_names)
#print("-----")
framework_name = myWorksheet.cell(0,2)
framework_version = myWorksheet.cell(0,3)
while curr_row < num_rows:
curr_row += 1
row = myWorksheet.row(curr_row)
this_row = get_row_values(myWorksheet, curr_row)
x = 0
while x <len(this_row):
if this_row[x] == 'x':
output.append(['', fooname, foo_version,
foo_names[x], foo_row[0]])
myData = [["foo1", "foo2",
"foo3", "foo4", "foo5"]]
myFile = open('./results/barTemp.csv', 'w')
with myFile:
writer = csv.writer(myFile)
writer.writerows(myData)
writer.writerows(output)
x += 1
#print output
myFile.close()
myWorkbook.release_resources()
It's not necessarily better; it still has the same runtime complexity.
The difference would be a more compact line:
For example, you can change
while x < len(this_row):
to
for x in this_row:
but I see that you use the 'x' index to find column_names[x] so another approach might be better such as
for x in range(len(this_row)):

count numbers associated with category in list

I have a list like this
GroupID,Number
yellow,1
yellow,2
tan,0
blue,1
black,2
black,3
What I want is this
GroupID,Number
yellow,3
tan, 0
blue,1
black,5
So I want to add the numbers associated with each groupID.
This is what I got, but have difficulty with the result statement:
from collections import defaultdict
d = defaultdict(list)
f = open("metal_modules.csv","r")
sheet = f.readlines()
#print sheet
for line in sheet[1:]:
#print line
spl = line.strip().split(",")
#print spl[1]
name = spl[0]
d[name].append(spl[1])
outfile = open("out.txt","w")
result = ""
for v in d.values():
result = #here I need to sum the number in column two for each key in the dictionary#
#print result
outfile.write(result)
f.close()
outfile.close()
keep it simple
result = ""
for group in d:
    # The numbers were appended to d as strings read from the file, so they
    # must be converted to int before sum() can add them up.
    result += "%s, %s\n" % (group, sum(int(n) for n in d[group]))
You could try the below if the order won't be an important issue for you.
from collections import defaultdict
with open('infile') as f:
d = defaultdict(list)
h = f.readline()
m = f.readlines()
for i in m:
s = i.rstrip().split(',')
d[s[0]].append(s[1])
with open('outfile', 'w') as w:
w.write(h)
for i in d.items():
w.write(i[0]+","+str(sum(map(int,i[1])))+"\n")
Take a look at the following:
with open("metal_modules.csv","r") as f:
sheet = f.readlines()
counter = {}
for line in sheet[1:]:
k,v = line.split(",")
if k in counter:
counter[k] += int(v)
else:
counter[k] = int(v)
with open("out.txt","w") as outfile:
result = "GroupID,Number\n"
for item in counter:
result += "%s,%s\n" % (item,counter[item])
outfile.write(result.strip())

Printing values following comparison of two csv files only if in a specific range using Python 3.3

I'm new at programming and I've got two CSV files that I'm trying to compare. The first file, snp.csv is shown below:
chrom position ref var gene var
1 21421 G T WASH7P snp.LOH
1 1251593 T C CPSF3L snp.somatic
6 107474777 - A PDSS2 indel.somatic
14 106586168 G T ADAM6 snp.LOH
The second file, quad.csv is shown below:
chrom Start End Sequence
1 21420 21437 GGGACGGGGAGGGTTGGG
1 23058 23078 GGGCTGGGGCGGGGGGAGGG
1 23515 23534 GGGAAGGGACAGGGCAGGG
1 45098 45118 GGGAAAGGGCAGGGCCCGGG
3 1148 1173 GGGCCGGGCAAGGCCGGGTGCAGGG
I want to compare these two files and if the two chrom values match, I want to print only those having position value (in snp.csv file) in the range of the start and end value (in the quad.csv file).
So, I am looking for a solution that will give me something like the following (basically the snp.csv file with start, end and sequence value of the quad.csv file)
chrom position ref var gene var Start End Sequence
1 21421 G T WASH7P snp.LOH 21420 21437 GGGACGGGGAGGGTTGGG
I've searched the posts and found some interesting answers that helped me a lot but I’m still experiencing some issues. I’m still learning Python…
Here is my script so far. I know I have a problem with the range function, and I'm stuck:
import csv
snp_file = open("snp.csv", "r")
quad_file = open("quad.csv", "r")
out_file = open("results.csv", "wb")
snp = csv.reader(snp_file, delimiter='\t')
quad = csv.reader(quad_file, delimiter='\t')
out = csv.reader(out_file, delimiter='\t')
quadlist = [row for row in quad]
for snp_row in snp:
row = 1
found = False
for quad_row in quadlist:
results_row = snp_row
if snp_row[0] == quad_row[0]:
quad_pos = range(quad_row[1], quad_row[2])
if snp_row[1] in quad_pos:
results_row.append(quad_row)
found = True
break
row = row + 1
if not found:
pass
print (results_row)
snp.close()
quad.close()
out.close()
from bisect import bisect_right
from collections import defaultdict
import csv
TOO_HIGH = 2147483647 # higher than any actual gene position
SNP_FMT = "{0:<7} {1:<11} {2:3} {3:3} {4:11} {5:15}".format
QUAD_FMT = " {1:<7} {2:<7} {3}".format
def line_to_quad(line):
    """Parse one whitespace-separated data line of the quad file.

    Args:
        line: Text containing at least four whitespace-separated fields.

    Returns:
        A ``(chrom, start, end, sequence)`` tuple; the first three fields
        are converted to ``int`` and the fourth is kept as a string.
        Any fields after the fourth are ignored.

    Raises:
        IndexError: If the line has fewer than four fields.
        ValueError: If one of the first three fields is not an integer.
    """
    fields = line.split()
    return int(fields[0]), int(fields[1]), int(fields[2]), fields[3]
def line_to_snp(line):
    """Parse one whitespace-separated data line of the snp file.

    Args:
        line: Text containing at least six whitespace-separated fields.

    Returns:
        A six-item tuple: the first two fields converted to ``int``
        followed by the next four fields as strings.  Any fields after the
        sixth are ignored.

    Raises:
        IndexError: If the line has fewer than six fields.
        ValueError: If one of the first two fields is not an integer.
    """
    fields = line.split()
    return (
        int(fields[0]),
        int(fields[1]),
        fields[2],
        fields[3],
        fields[4],
        fields[5],
    )
class Quads:
    """Per-chromosome index of quad segments used for position lookups.

    ``chromosomes`` maps a chromosome id to a list of
    ``(start, end, sequence)`` tuples kept in ascending order.
    """

    @classmethod
    def from_file(cls, fname):
        """Build an index from the quad file at ``fname``.

        The first line is skipped as a header and every following non-blank
        line is parsed with ``line_to_quad``.
        """
        # Plain "r" already gives universal-newline handling on Python 3;
        # the "U" flag is rejected from Python 3.11 onwards.
        with open(fname, "r") as inf:
            next(inf, None)  # skip header line
            # The generator is lazy, so the index has to be built while the
            # file is still open.
            return cls(line_to_quad(line) for line in inf if line.strip())

    def __init__(self, rows):
        """Group ``(chrom, start, end, sequence)`` tuples by chromosome."""
        self.chromosomes = defaultdict(list)
        for row in rows:
            self.chromosomes[row[0]].append(row[1:])
        # Sorted segments are what makes the bisect lookup in find_match valid.
        for segs in self.chromosomes.values():
            segs.sort()

    def find_match(self, chromosome, position):
        """Return the segment of ``chromosome`` that contains ``position``.

        Only the segment with the greatest start that does not exceed
        ``position`` is examined.

        Returns:
            ``(chromosome, start, end, sequence)`` when ``position`` lies in
            that segment (both ends inclusive), otherwise ``None``.
        """
        # .get() avoids inserting an empty list into the defaultdict for
        # chromosomes that were never loaded.
        segs = self.chromosomes.get(chromosome)
        if not segs:
            return None
        # The infinite second element sorts after every real segment that
        # starts at ``position``, so bisect lands just past the candidates.
        index = bisect_right(segs, (position, float("inf"))) - 1
        if index < 0:
            # Every segment starts after ``position``.
            return None
        seg = segs[index]
        if seg[0] <= position <= seg[1]:
            return (chromosome,) + seg
        return None
def main():
    """Print every SNP whose position falls inside a quad segment.

    Loads the segment index from ``quad.csv``, then reads ``snp.csv`` line
    by line and prints each SNP that has a matching segment on the same
    chromosome, followed by that segment's start, end and sequence.
    """
    quads = Quads.from_file("quad.csv")
    print(  # header
        SNP_FMT("chrom", "position", "ref", "var", "gene", "var")
        + QUAD_FMT("chrom", "Start", "End", "Sequence")
    )
    with open("snp.csv") as inf:
        next(inf, None)  # skip header line
        for line in inf:
            if not line.strip():
                # A blank line has no fields, so line_to_snp would raise
                # IndexError on it.
                continue
            snp = line_to_snp(line)
            quad = quads.find_match(snp[0], snp[1])
            if quad:
                print(SNP_FMT(*snp) + QUAD_FMT(*quad))


if __name__ == "__main__":
    main()
which gives
chrom position ref var gene var Start End Sequence
1 21421 G T WASH7P snp.LOH 21420 21437 GGGACGGGGAGGGTTGGG

Categories

Resources