list filtering in python

list filtering in python - python

my program will import a file created by user having maximum of 15 lines of text in it(examples below)
#ID #value1 # value2
445 4 9000
787 2 4525
405 4 3352
415 1 2854
455 2 5500
r
The program then filter all the lines that have #value 1 larger than 2 and #value 2 larger than 3000 and print out their #ID, ignoring the rest.
here's what i have done so far
filename = ('input.txt')
infile = open(filename, 'r')
list_id = []
list_value1 = []
list_value2 = []
masterlist = []
for line in infile:
id, value1, value2 = line.split()
list_id.append(id)
list_value1.append(value1)
list_value2.append(value2)
masterlist.append(list_id)
masterlist.append(list_value1)
masterlist.append(list_value2)
#filtering part
sort = [i for i in masterlist[1] if i > 2 ] and [p for p in masterlist[2] if p > 3000]
#do something here to print out the ID of the filtered lines

Using your code as a starting point:
filename = ('input.txt')
infile = open(filename, 'r')
ids = []
for line in infile:
id, value1, value2 = line.split()
if int(value1) > 2 and int(value2) > 3000:
ids.append(id)
Put in exception handling for non-integer values as needed.

Related

Python making matching pairs

I've got the .txt file like this within:
Crista
Jame
7,3
2,0
Wiki
Rok
4,1
6,2
3,2
6,8
Pope
Lokk
5,2
0,1
3,1
Sam
Antony
4,3
9,1
My code to find all names and append them to the names[] list, and to find all digits and append them to the digits[] list (if there are more than two lines with digits in a row I didn't need them in the list before):
import re
f=open('mine.txt')
names=[]
digits=[]
count=0
for line in f:
line = line.rstrip()
if re.search('^[a-zA-Z]', line):
name=line
names.append(name)
if re.findall('^\d{1}:\d{1}', line):
if count < 2 :
digit=line
digits.append(digit)
count += 1
elif line != "" :
count = 0
Then I made pairs for matching names and digits:
my_pairs_dig=list()
while(digits):
a = digits.pop(0); b = digits.pop(0)
my_pairs_dig.append((a,b))
my_pairs_dig
my_pairs_names = list()
while(names):
a = names.pop(0); b = names.pop(0)
my_pairs_names.append((a,b))
my_pairs_names
outp=list(zip(my_pairs_names,my_pairs_dig))
And got this output:
[(('Crista', 'Jame'), ('7,3', '2,0')), (('Wiki', 'Rok'), ('4,1', '6,2')), (('Pope', 'Lokk'), ('5,2', '0,1')), (('Sam', 'Antony'),('4,3', '9,1'))]
But plans were changed and now my desired outout is:
[(('Crista', 'Jame'), ('7,3', '2,0')), (('Wiki', 'Rok'), ('4,1', '6,2'), ('3,2', '6,8')), (('Pope', 'Lokk'), ('5,2', '0,1'), ('3,1')), (('Sam', 'Antony'),('4,3', '9,1'))]
How can I rewrite my code to got the desired outoput?

Try this
with open('test.txt', 'r') as fp:
data = fp.read().split("\n")
i, res = 0, []
while i < len(data):
if data[i].isalpha():
names = (data[i], data[i+1])
i += 2
digits = []
while i < len(data) and not data[i].isalpha():
digits.append(data[i])
i += 1
digits = tuple(digits)
if len(digits) > 2:
res.append((names, digits[: 2], digits[2: ]))
else:
res.append((names, digits[: 2]))
print(res)
Output:
[(('Crista', 'Jame'), ('7,3', '2,0')), (('Wiki', 'Rok'), ('4,1', '6,2'), ('3,2', '6,8')), (('Pope', 'Lokk'), ('5,2', '0,1'), ('3,1',)), (('Sam', 'Antony'), ('4,3', '9,1'))]

Try this:
import re
digits=[]
result = []
name1, name2 = None, None
for line in f:
if line:
line = line.rstrip()
if re.search('^[a-zA-Z]', line):
if name1 and name2:
result.append(((name1, name2), *tuple(tuple(digits[i:i+2]) for i in range(0, len(digits), 2))))
name1, name2, digits = None, None, []
if name1:
name2 = line
else:
name1 = line
else:
digits.append(line)
if name1 and name2:
result.append(((name1, name2), *tuple(tuple(digits[i:i+2]) for i in range(0, len(digits), 2))))
name1, name2, digits = None, None, []
print(result)
Output:
[(('Crista', 'Jame'), ('7,3', '2,0')), (('Wiki', 'Rok'), ('4,1', '6,2'), ('3,2', '6,8')), (('Pope', 'Lokk'), ('5,2', '0,1'), ('3,1',)), (('Sam', 'Antony'), ('4,3', '9,1'))]
This is based on your assumption:
that's always two names and then 2,3 or 4 lines with numbers

Lists in dictionary

I have a text file of format like this
10:45 a b c
x 0 1 2
y 4 5 6
z 7 8 9
I want to make x as key and 0,1,2 its value in the list.Same with y and z
while(1):
line = f.readline()
if time in line:
print (line)
L1.append(line)
for count,line in enumerate(f):
if (i < 3):
L1.append(line)
print ("Line{} : {}".format(count,line.strip()))
i=i+1
#print(L1)
for k in range(1):
print(L1[k])
test1 = L1[k]
a1 = test1.split()
print (a1[1])
dict = {a1[1]: L1[k] for a1[1] in L1[k]}
print (dict)
for k in range(1,3):
#print("hey")
print (L1[k]) #will list the single row
test = L1[k]
#print(test)
a = test.split()
print (a[0])
dict = {a[0]:L1[k] for a[0] in L1[k]}
print (dict)
Any idea what i am doing wrong here?
P.S. - I am new to python

You could try this:
my_dict = {}
lines = f.readLines()
lines.pop(0)
for line in lines:
line_list = line.split(' ')
key = line_list.pop(0)
my_dict.update({key: line_list})

This will accomplish what it is I think you need (assuming your text file is stored in the same directory and you replace 'test.txt' with your filename):
with open('test.txt', 'r') as values:
contents = values.read().strip().split()
new_dict = {}
i = 4
while i <= len(contents)-4:
new_dict.update({contents[i]: contents[i+1:i+4]})
i += 4
print(new_dict)
or this, if you want the values as integers:
with open('test.txt', 'r') as values:
contents = values.read().strip().split()
new_dict = {}
i = 4
while i <= len(contents)-4:
new_dict.update({contents[i]: [int(contents[i+1]),int(contents[i+2]),int(contents[i+3])]})
i += 4
print(new_dict)

Try this
import string
start_found = False
result_dict = {}
for line in open("stack1_input.txt", mode='r'):
if (line.startswith("10:45")):
start_found = True
continue
if start_found == True:
values = line.split()
if len(values) == 4:
result_dict[values[0]] = values[1:]
print (result_dict)

Fastest way to create dataframe from hundreds csv files

I have 150 csv files with two column (time and site). I want to read each file, creating frequency dictionary ({'site':[site_number, number of occurrences site]}) and creating DataFrame consists of 11 columns (user_id, site1, site2, ...site10), user_id parsing from name of file (../user0001.csv). Each row in DataFrame unique session of 10 site visits. My codes worked on 150 files 150 seconds (its terrible). How can i improve it?
def prepare_3(path_to_csv_files, session_length=10):
word_freq = {}
freq_dict = {}
word_count = 0
row = []
columns = []
columns.append('user_id')
columns.extend(['site' + str(i) for i in range(1, session_length+1)])
lst_files = sorted(glob(path_to_csv_files))
for csv in lst_files:
user = int(csv[csv.find('.')-4:csv.find('.')])
frame = []
frame.append(user)
site_count = 0
with open(csv, 'r') as f:
f.readline()
for line in f:
site = line[line.find(',') + 1:].rstrip()
site_count += 1
if site in word_freq:
word_freq[site][1] += 1
else:
word_count += 1
word_freq[site] = [word_count, 1]
if site_count > session_length:
site_count = 1
row.append(frame)
frame = []
frame.append(user)
frame.append(word_freq[site][0])
else:
frame.append(word_freq[site][0])
row.append(frame)
df = pd.DataFrame(data=row, columns=columns, dtype=int)
df.fillna(0 ,inplace=True)
return df, word_freq

counting the amount of elements in a nested list using a dictionary

i have this program that reads data from a file and then turns it into a nested list and im trying to count the amount of elements in it using a dictionary if possible and output something like this
I have L which is a list and the program should check if the elements are in that list and if so count them. which i haven't been able to figure out how to do
def count(fnm,L):
try:
fp = open(fnm,'r')
count ={}
table = []
l_temp = []
for line in fp:
line = line.strip()
l_temp.append(line)
fp.close()
i = 0
while i < len(l_temp):
last = l_temp[i].split(' ')
table.append(last)
i += 1
return table
except IOError:
print("Error! no such file")
def main():
L = ['CSE', '1310', '1104', '1105']
fnm = 'task2_test1.txt'
q = count(fnm,L)
print(q)
main()
output:
[['5333', '1105'], ['CSE', '1310', 'CSE', '1104'], ['CSE', '2325']]
3 : CSE 1310 CSE 1104
1 : 5333 1105
1 : CSE 2325

Printing values following comparison of two csv files only if in a specific range using Python 3.3

I'm new at programming and I've got two CSV files that I'm trying to compare. The first file, snp.csv is shown below:
chrom position ref var gene var
1 21421 G T WASH7P snp.LOH
1 1251593 T C CPSF3L snp.somatic
6 107474777 - A PDSS2 indel.somatic
14 106586168 G T ADAM6 snp.LOH
The second file, quad.csv is shown below:
chrom Start End Sequence
1 21420 21437 GGGACGGGGAGGGTTGGG
1 23058 23078 GGGCTGGGGCGGGGGGAGGG
1 23515 23534 GGGAAGGGACAGGGCAGGG
1 45098 45118 GGGAAAGGGCAGGGCCCGGG
3 1148 1173 GGGCCGGGCAAGGCCGGGTGCAGGG
I want to compare these two files and if the two chrom values match, I want to print only those having position value (in snp.csv file) in the range of the start and end value (in the quad.csv file).
So, I am looking for a solution that will give me something like the following (basically the snp.csv file with start, end and sequence value of the quad.csv file)
chrom position ref var gene var Start End Sequence
1 21421 G T WASH7P snp.LOH 21420 21437 GGGACGGGGAGGGTTGGG
I've searched the posts and found some interesting answers that helped me a lot but I’m still experiencing some issues. I’m still learning Python…
Here is my script up to now, I know I have a problem with the range function...I'm stuck
import csv
snp_file = open("snp.csv", "r")
quad_file = open("quad.csv", "r")
out_file = open("results.csv", "wb")
snp = csv.reader(snp_file, delimiter='\t')
quad = csv.reader(quad_file, delimiter='\t')
out = csv.reader(out_file, delimiter='\t')
quadlist = [row for row in quad]
for snp_row in snp:
row = 1
found = False
for quad_row in quadlist:
results_row = snp_row
if snp_row[0] == quad_row[0]:
quad_pos = range(quad_row[1], quad_row[2])
if snp_row[1] in quad_pos:
results_row.append(quad_row)
found = True
break
row = row + 1
if not found:
pass
print (results_row)
snp.close()
quad.close()
out.close()

from bisect import bisect_right
from collections import defaultdict
import csv
TOO_HIGH = 2147483647 # higher than any actual gene position
SNP_FMT = "{0:<7} {1:<11} {2:3} {3:3} {4:11} {5:15}".format
QUAD_FMT = " {1:<7} {2:<7} {3}".format
def line_to_quad(line):
row = line.split()
return int(row[0]), int(row[1]), int(row[2]), row[3]
def line_to_snp(line):
row = line.split()
return int(row[0]), int(row[1]), row[2], row[3], row[4], row[5]
class Quads:
#classmethod
def from_file(cls, fname):
with open(fname, "rU") as inf:
next(inf, None) # skip header line
quads = (line_to_quad(line) for line in inf)
return cls(quads)
def __init__(self, rows):
self.chromosomes = defaultdict(list)
for row in rows:
self.chromosomes[row[0]].append(row[1:])
for segs in self.chromosomes.values():
segs.sort()
def find_match(self, chromosome, position):
segs = self.chromosomes[chromosome]
index = bisect_right(segs, (position, TOO_HIGH, "")) - 1
try:
seg = segs[index]
if seg[0] <= position <= seg[1]:
return (chromosome,) + seg
except IndexError:
pass
def main():
quads = Quads.from_file("quad.csv")
print( # header
SNP_FMT("chrom", "position", "ref", "var", "gene", "var") +
QUAD_FMT("chrom", "Start", "End", "Sequence")
)
with open("snp.csv") as inf:
next(inf, None) # skip header line
for line in inf:
snp = line_to_snp(line)
quad = quads.find_match(snp[0], snp[1])
if quad:
print(SNP_FMT(*snp) + QUAD_FMT(*quad))
if __name__=="__main__":
main()
which gives
chrom position ref var gene var Start End Sequence
1 21421 G T WASH7P snp.LOH 21420 21437 GGGACGGGGAGGGTTGGG

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

list filtering in python - python

Using your code as a starting point: filename = ('input.txt') infile = open(filename, 'r') ids = [] for line in infile: id, value1, value2 = line.split() if int(value1) > 2 and int(value2) > 3000: ids.append(id) Put in exception handling for non-integer values as needed.

Related

Python making matching pairs

Lists in dictionary

Fastest way to create dataframe from hundreds csv files

counting the amount of elements in a nested list using a dictionary

Printing values following comparison of two csv files only if in a specific range using Python 3.3

Categories

Resources