How do I organize data in alphabetical order? - python

So I have some pieces of data stored in a folder as .txt, e.g. FRED.txt & BOB.txt, which in the text file contain their 5 random numbers chosen from 1 to 10 and I am stuck as to how I can print their names (in alphabetical order) along with their highest random number. I know that I have to use the glob or os libraries but I don't really know where to go with them.
So far I have this...
import glob, os
# Asker's starting point: open one path, read it, and close it again.
# The "r+" mode opens the file for both reading and writing.
dataFile = open("directory_pathway", "r+")
# The text returned by read() is not stored, so nothing can be done with it yet.
dataFile.read()
# Somehow printing names & highest number here.
dataFile.close()
Any help is much appreciated. Thanks :)

Use the glob module to collect only the text files from the input directory.
Use a for loop to iterate over every text file.
Read the file content.
Get the maximum number from the file content.
Add it to the result dictionary, keyed by file name.
Sort the dictionary keys and print each key with its value.
input: Following contents in FRED.txt file
2
4
6
8
10
code:
import glob
import os
dir_path = "/home/vivek/Desktop/stackoverflow/input"

# Collect only the .txt files that live directly inside the input directory.
text_files = glob.glob(os.path.join(dir_path, "*.txt"))
print("Test Files: %s" % (text_files,))

# Map each file's base name (e.g. "FRED.txt") to the largest integer it holds.
result = {}
for file_path in text_files:
    with open(file_path) as fp:
        numbers = [int(token) for token in fp.read().split()]
    if not numbers:
        # An empty file has no maximum; skip it instead of crashing in max().
        continue
    result[os.path.basename(file_path)] = max(numbers)

# Print the file names in alphabetical order together with their maximum.
for file_name in sorted(result):
    print("File Name: %s, MAx Random Number: %d" % (file_name, result[file_name]))
output:
Test Files: ['/home/vivek/Desktop/stackoverflow/input/AOB.txt', '/home/vivek/Desktop/stackoverflow/input/ABO.txt', '/home/vivek/Desktop/stackoverflow/input/FRED.txt', '/home/vivek/Desktop/stackoverflow/input/BOB.txt']
File Name: ABO.txt, MAx Random Number: 9
File Name: AOB.txt, MAx Random Number: 9
File Name: BOB.txt, MAx Random Number: 9
File Name: FRED.txt, MAx Random Number: 10
vivek#vivek:~/Desktop/stackoverflow/input$

import glob, os, re
# Sorted list of every .txt path in the directory (alphabetical order).
names = sorted(glob.glob(os.path.join('path_to_dir', '*.txt')))

for filename in names:
    # basename() yields e.g. "FRED.txt" for any file name, including names
    # containing characters such as '-' that a \w+ regex would cut off.
    print(os.path.basename(filename))
    with open(filename, 'r') as text:
        data = text.read().split()
    if data:
        # key=int compares numerically while the original token is printed.
        print(max(data, key=int), '\n')

# Keep the console window open until the user presses Enter (Python 3).
input()

import os
folder = "/path/to/folder/"

# File name -> highest number found in that file.
result_dict = {}
for name in sorted(n for n in os.listdir(folder) if n.endswith(".txt")):
    # os.listdir() returns bare names, so the folder must be joined back on;
    # otherwise the open() only works when the script runs inside `folder`.
    with open(os.path.join(folder, name)) as f:
        # Blank lines are skipped so that int() never sees an empty string.
        num = sorted(int(line) for line in f if line.strip())
    print(num)
    if num:
        result_dict[name] = num[-1]

# Names in alphabetical order, each followed by its highest number.
for name, highest in sorted(result_dict.items()):
    print("%s %s" % (name, highest))

sort the file names found with glob, map the contents to int and print the filename f and the max:
import glob
import os
path = "path/"
for f in sorted(glob.glob(os.path.join(path, "*.txt"))):
    # glob() already returns every match prefixed with `path`; joining `path`
    # on again would turn "path/x.txt" into "path/path/x.txt".
    with open(f) as fl:
        # Each line is converted lazily by map(), so only one value is held at a time.
        print("Filename: {}\nMax value: {}".format(f, max(map(int, fl))))
In Python 3, map returns a lazy map object, so we don't need to build a list to find the max; only one line/value is held in memory at a time.

sorted(glob.glob("*.txt")) will get you the list of filenames, sorted. Then iterate over that list, open each file, and print whatever you like.

Related

Loop through pair of files python

I have a script that receives two files as input and creates a dictionary based on lines. Finally, it overwrites the first file.
I am looking for a way to run this script on all file pairs of a folder, choosing as sys.argv[1] and sys.argv[2] based on a pattern in the name.
import re
import sys
def _read_pairs(path):
    """Parse a file of alternating key/value lines into a dict.

    Lines 1, 3, 5, ... become keys and the line following each key becomes
    its value. A trailing key with no following line keeps the placeholder
    value 0.
    """
    pairs = {}
    key = None
    with open(path, 'r') as f:
        for index, line in enumerate(f):
            text = line.strip()
            if index % 2 == 0:
                key = text
                pairs[key] = 0
            else:
                pairs[key] = text
    return pairs


def process(datafile, schemaseqs):
    """Rewrite *datafile*, translating its values through *schemaseqs*.

    Both files are read as alternating key/value lines. Every value of
    *datafile* that appears as a key in *schemaseqs* is replaced by the
    corresponding *schemaseqs* value. The resulting mapping is printed,
    written back over *datafile* (one key line, then one value line) and
    returned.
    """
    d = _read_pairs(datafile)
    new_d = _read_pairs(schemaseqs)

    for key, value in d.items():
        if value in new_d:
            d[key] = new_d[value]
    print(d)

    with open(datafile, 'w') as out:
        for k, v in d.items():
            # format() also copes with the int placeholder 0, which
            # writelines() rejected with a TypeError after the file had
            # already been truncated.
            out.write("{}\n{}\n".format(k, v))
    return d


if __name__ == "__main__":
    process(sys.argv[1], sys.argv[2])
I have hundreds of file pairs all sharing the same pattern proteinXXXX (where XXXX is a number) This number can have up to four digits (e.g. 9,99,999 or 9999). So I have protein 555.txt and protein 555.fasta
I've seen I can use glob or os.listdir to read files from a directory. However, I cannot assign them to a variable and extract the lines one pair at a time in every pair of the directory.
Any help is appreciated.
Just the concept.
Import required libraries.
import glob
import os.path
Define function that extracts only the basename (the part without extension) from filename.
def basename(fn):
    """Return the file name of *fn* without its directory and final extension."""
    return os.path.splitext(os.path.basename(fn))[0]
Create two sets, one with .txt files, another with .fasta files.
# Base names (directory and extension removed) of the protein*.txt files
# and of the protein*.fasta files found in the current directory.
t = set(map(basename, glob.glob("protein*.txt")))
f = set(map(basename, glob.glob("protein*.fasta")))
Calculate intersection of these two sets to be sure that both .txt and .fasta files exist with the same basename. Then add the missing suffixes and let them process with the existing code.
# Only base names present in BOTH sets have a complete .txt/.fasta pair.
# Sets have no defined order, so sort to process the pairs reproducibly.
for bn in sorted(t.intersection(f)):
    process(bn + ".txt", bn + ".fasta")

Wrong output when trying to create a manifest-generating script (python) for Rail-RNA. Files do not match

What I want: To create a "manifest" for running rail-RNA (http://rail.bio/) on a number of fastq files, that I have in a directory, in the following form:
FASTQ URL 1 (tab) optional MD5 1 (tab) FASTQ URL 2 (tab) optional MD5 2 (tab) sample label
like:
/home/data/10080-17_r1.fastq 0 /home/data/10080-17_r2.fastq 0 10080-17_r1
/home/data/10300-25_r1.fastq 0 /home/data/10300-25_r2.fastq 0 10300-25_r1
/home/data/40500-72_r1.fastq 0 /home/data/40500-72_r2.fastq 0 10300-25_r2
.. and so on
What I have done: created a python script to generate a manifest from fastq files in a specific directory:
#!/usr/bin/python
import os
import csv
# Asker's original attempt: builds manifest.txt from the fastq files found
# below the current working directory.
thisdir = os.getcwd()
# Create empty lists
forward = []
reverse = []
zero = []
names = []
# Extract file-paths for files ending with .fastq and append them to "forward" and "reverse"
# NOTE: these are substring tests ("in"), and they look for "_1.fastq" /
# "_2.fastq", whereas the example paths in the question end in
# "_r1.fastq" / "_r2.fastq".
for r, d, f in os.walk(thisdir): # r=root, d=directories, f = files
for file in f:
if "_1.fastq" in file:
forward.append(os.path.join(r, file))
if "_2.fastq" in file:
reverse.append(os.path.join(r, file))
# make a list containing 0 with the length of the forward list
for i in range(len(forward)):
zero.append('0')
# extract filenames without extensions:
# NOTE: os.listdir() lists only thisdir itself, while the os.walk() loop
# above also descends into sub-directories.
l = os.listdir(thisdir)
li = [x.split('.')[0] for x in l]
for name in li:
if "_1" in name:
names.append(name)
# NOTE: str.strip('_1') removes any leading/trailing '_' and '1' characters;
# it does not remove "_1" as a unit.
names = [s.strip('_1') for s in names]
# write the output to a file
# NOTE: zip() pairs the lists purely by position; nothing here matches a
# forward file with the reverse file of the same sample.
with open('manifest.txt', 'w') as f:
writer = csv.writer(f, delimiter='\t')
for path in zip(forward, zero, reverse, zero, names):
writer.writerow(list(path))
What is wrong?: I get a manifest.txt in the right format, BUT, it does not match the right *_r1.fastq and *_r2.fastq files. It does something like this (the r1's in the first column does not match the r2's of the third column)
/home/data/10080-17_r1.fastq 0 /home/data/40500-72_r2.fastq 0 10080-17_r1
/home/data/10300-25_r1.fastq 0 /home/data/10080-17_r2.fastq 0 10300-25_r1
/home/data/40500-72_r1.fastq 0 /home/data/10300-25_r2.fastq 0 10300-25_r2
Do some of you, more experienced Python'ers have a solution to fix this?
That would be greatly appreciated!
Best wishes, Birgitte
With the provided solution, this error occurs when the number of *_r1.fastq files does not correspond to the number of *_r2.fastq files, because that code builds the CSV rows purely by list index and never compares file names.
I updated that solution. Check the file names; they should look like:
/home/data/10080-17_r1.fastq
/home/data/10080-17_r2.fastq
At the moment we take every forward file ( *_r1.fastq ) and try to find the matching reverse file ( *_r2.fastq ) in the same directory. If we don't find it, we put '-' in place of the reverse file's name.
Please check the code and read comments:
#!/usr/bin/python
import os
import csv
FORWARD_SUFFIX = "_r1.fastq"
REVERSE_SUFFIX = "_r2.fastq"


def build_manifest_rows(forward_files, reverse_files):
    """Pair every forward file with its reverse mate and return manifest rows.

    Each row is ``(forward_path, md5_1, reverse_path, md5_2, sample_label)``.
    The reverse mate is the path that is identical to the forward path except
    for ending in ``_r2.fastq`` instead of ``_r1.fastq``; when no such file
    is in *reverse_files*, ``'-'`` is used in its place. Rows are ordered by
    forward path.
    """
    # A set gives constant-time lookups instead of scanning a list per file.
    available_reverse = set(reverse_files)
    rows = []
    for forward_file in sorted(forward_files):
        # Cut off only the trailing suffix: str.replace("_r1", "_r2") would
        # also rewrite "_r1" occurring in a directory name.
        stem = forward_file[:-len(FORWARD_SUFFIX)]
        # /home/data/10080-17_r1.fastq -> 10080-17
        sample_label = os.path.basename(stem)
        candidate = stem + REVERSE_SUFFIX
        reverse_file_result = candidate if candidate in available_reverse else "-"
        # in that place we can count md5 for the FORWARD file
        md5_url_1 = 0
        # in that place we can count md5 for the REVERSE file
        md5_url_2 = 0
        rows.append((forward_file, md5_url_1, reverse_file_result,
                     md5_url_2, sample_label))
    return rows


def main():
    """Write manifest.txt for all fastq files below the current directory."""
    this_dir = os.getcwd()
    forward_arr = []
    reverse_arr = []
    for root, _dirs, files in os.walk(this_dir):
        for file_name in files:
            # Match on the real suffix, not on a substring anywhere in the name.
            if file_name.endswith(FORWARD_SUFFIX):
                forward_arr.append(os.path.join(root, file_name))
            elif file_name.endswith(REVERSE_SUFFIX):
                reverse_arr.append(os.path.join(root, file_name))

    # re-write all data to csv file per one iteration
    with open('manifest.txt', 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(build_manifest_rows(forward_arr, reverse_arr))


if __name__ == "__main__":
    main()
I think this should work for what you need. It's difficult to make out, because this:
names = [s.strip('_1') for s in names]
doesn't look like it does what was intended (I suspect the marker is supposed to be "_r1", which is what I have used in the first loop of the modified version below)
import os
import csv
thisdir = os.getcwd()
FORWARD_SUFFIX = "_r1.fastq"
REVERSE_SUFFIX = "_r2.fastq"

# Index both mates by their path without the suffix, so that a forward file
# and its reverse file share the same key.
forward_by_stem = {}
reverse_by_stem = {}
for r, d, f in os.walk(thisdir):  # r=root, d=directories, f = files
    for file_name in f:
        if file_name.endswith(FORWARD_SUFFIX):
            stem = os.path.join(r, file_name[:-len(FORWARD_SUFFIX)])
            forward_by_stem[stem] = os.path.join(r, file_name)
        elif file_name.endswith(REVERSE_SUFFIX):
            stem = os.path.join(r, file_name[:-len(REVERSE_SUFFIX)])
            reverse_by_stem[stem] = os.path.join(r, file_name)

# Build the three parallel lists from complete pairs only, so that index i of
# each list always refers to the same sample.
forward = []
reverse = []
names = []
for stem in sorted(forward_by_stem):
    if stem in reverse_by_stem:
        forward.append(forward_by_stem[stem])
        reverse.append(reverse_by_stem[stem])
        # Slice the suffix off; str.strip("_r1.fastq") would strip individual
        # characters from both ends of the name instead.
        names.append(os.path.basename(stem))

# write the output to a file
with open('manifest.txt', 'w') as out:
    writer = csv.writer(out, delimiter='\t')
    for fwd, rev, nam in zip(forward, reverse, names):
        writer.writerow([fwd, 0, rev, 0, nam])

Python Line Comparison

I have a directory with subdirectories full of files that all contain the line :
VERSION "1.0" # THAT NUMBER VARIES.
So one file as an example of the content would read :
#comments about what the file program does
#
#
#ifndefine _AWS_THIS_READS_STUFF_H
#define _AWS_THIS_READS_STUFF_H
#define AWS_THIS_READS_STUFF_VERSION "1.2" <---this is the line I want
#to compare that is in all
Then the read of the file would be some long program in C that is written.
I used the number to identify when I have made changes. I need to make something in Python that identifies if the numbers before the decimal and after the decimal all match within a directory, since there are hundreds of files.
import glob
import sys
import re
import filecmp
from collections import defaultdict
from itertools import dropwhile
# List every .h file in the current directory and print its name.
myfiles = glob.glob('*.h')
for filename in myfiles:
print(filename) #this prints all of the file names
# Builds a dict of lists: for each file matching the glob pattern `patt`, the
# file name is appended under the result of searching its first line that
# contains "VERSION".
# NOTE: nothing in this snippet calls all_same(), so only the listing loop
# before it produces output.
def all_same(patt):
r = re.compile('#define\s+([_A-Z]+)_VERSION\s+("\d+\.\d+")$') #REGEX to find
files = glob.glob(patt)
d = defaultdict(list)
for file in files:
with open(file) as f:
# dropwhile() skips lines until the first one containing "VERSION"; next()
# raises StopIteration when no line does.
# NOTE: r.search() returns a match object (or None), and that object - not
# the version text - is what gets used as the dict key below.
version = r.search(next(dropwhile(lambda x: "VERSION" not in x, f)))
d[version].append(file) #search for VERSION LINE
return d
and when it is run in my command prompt it prints nothing out.
ALSO TRIED THIS! - I got it to print out the file and the "0.0" number but now I need to compare!
# Second attempt: print, for every line ending in VERSION "<n>.<n>", the
# version number(s) found on it followed by the file name.
myfiles = glob.glob('*.h') #reads in all files ending in .h
for file in myfiles:
for line in open(file):
line = line.rstrip()
if re.search('VERSION\s+("\d+\.\d+")$', line):
# NOTE: the name `list` shadows the built-in list type from here on.
list = re.findall("\d+\.\d+" , line)
list.append(file)
print(list)
#print (list + ' : ' + root + "/" + myfile)
with open(file) as f:
# NOTE: this findall() is applied to `file` (the file name), not to the
# contents of the file.
version = re.findall('VERSION\s+("\d+\.\d+")$', file)
# NOTE: re.search() is given a single argument here; it needs both a pattern
# and a string to search.
version = re.search(next(dropwhile(lambda x: "VERSION" not in x, f)))
print(version)
Just need to figure out how to compare numbers in the list of "0.0" - Once again, before decimal and after)
If your version line is header of each file you could pull the first returned from iglob and compare that line to the rest with all:
import glob
import re
from collections import defaultdict
def all_same(patt):
    """Group the files matching the glob pattern *patt* by version number.

    Returns a ``defaultdict`` mapping each version string (e.g. ``"1.2"``) to
    the list of files whose first ``VERSION <number>`` line declares it. The
    number may be wrapped in double quotes, as in ``VERSION "1.2"``. Files
    without such a line are left out. All files share one version exactly
    when the result has a single key.
    """
    # The optional quote lets the pattern match both VERSION 1.2 and VERSION "1.2".
    r = re.compile(r'VERSION\s+"?(\d+\.\d+)')
    d = defaultdict(list)
    for file in glob.iglob(patt):
        with open(file) as f:
            for line in f:
                found = r.search(line)
                if found:
                    # Key on the captured number, not on the match object:
                    # match objects never compare equal, so nothing would group.
                    d[found.group(1)].append(file)
                    break
    return d
If you actually want the names of all the files with different version numbers, I would find the VERSION line in each file and group by the version number in a dict, using the version number as the key and appending the file names as the values:
import glob
import re
from collections import defaultdict
from itertools import dropwhile
def all_same(path):
    """Group the files matching the glob pattern *path* by version number.

    Returns a ``defaultdict`` whose keys are version numbers such as
    ``"1.2"`` and whose values are the lists of files declaring that version
    on their first ``VERSION <number>`` line (the number may be quoted).
    Files with no such line are skipped rather than raising.
    """
    # The optional quote lets the pattern match both VERSION 1.2 and VERSION "1.2".
    r = re.compile(r'VERSION\s+"?(\d+\.\d+)')
    d = defaultdict(list)
    for file in glob.iglob(path):
        with open(file) as f:
            # First successful match in the file, or None when no line matches.
            found = next(filter(None, map(r.search, f)), None)
        if found is not None:
            # group(1) is the bare version number, so differences in spacing
            # or quoting cannot split one version across several keys.
            d[found.group(1)].append(file)
    return d

Saving Filenames with Condition

I'm trying to save the names of files that fulfill a certain condition.
I think the easiest way to do this would make a short Python program that imports and reads the files, checks if the condition is met, and (assuming it is met) then saves the names of the files.
I have data files with just two columns and four rows, something like this:
a: 5
b: 5
c: 6
de: 7
I want to save the names of the files (or part of the name of the files, if that's a simple fix, otherwise I can just sed the file afterwards) of the data files that have the 4th number ([3:1]) greater than 8. I tried importing the files with numpy, but it said it couldn't import the letters in the first column.
Another way I was considering was to do it from the command line, something along the lines of cat *.dat >> something.txt, but I couldn't figure out how to do that.
The code I've tried to write up to get this to work is:
import fileinput
import glob
import numpy as np
#Filter to find value > 8
#Globbing value datafiles
file_list = glob.glob("/path/to/*.dat")
#Creating output file containing
f = open('list.txt', 'w')
#Looping over files
for file in file_list:
#For each file in the directory, isolating the filename
filename = file.split('/')[-1]
#Opening the files, checking if value is greater than 8
# NOTE: "file" is a string literal here, not the loop variable `file`.
# NOTE(review): older NumPy releases expected usecols to be a sequence, which
# may be the source of the reported "'int' object is not iterable" - confirm.
a = np.loadtxt("file", delimiter=' ', usecols=1)
# NOTE: a[3:0] is a slice (start 3, stop 0), not an index into row 3, column 1.
if a[3:0] > 8:
# Python 2 "print >>" syntax: writes filename plus a newline to the file f.
print >> f, filename
f.close()
When I do this, I get an error that says TypeError: 'int' object is not iterable, but I don't know what that's referring to.
I ended up using
import fileinput
import glob
import numpy as np
# Filter: keep the names of the data files whose 4th-row value is > 8.
# Globbing datafiles
file_list = glob.glob("/path/to/*.dat")

# The with-block closes list.txt even if reading one of the files fails.
with open('list.txt', 'w') as f:
    # Looping over files
    for file in file_list:
        # For each file in the directory, isolating the filename
        filename = file.split('/')[-1]
        # genfromtxt() turns the non-numeric label column into nan and keeps
        # the numbers, so the value sits at row 3, column 1.
        a = np.genfromtxt(file)
        if a[3, 1] > 8:
            f.write(filename + "\n")
it is hard to tell exactly what you want but maybe something like this
from glob import glob
from re import findall
fpattern = "/path/to/*.dat"


def test(fname):
    """Return True when the value in the 4th row, 2nd column of *fname* is > 8.

    Rows are the non-blank lines of the file and columns are
    whitespace-separated, e.g. ``de: 7``. Returns False when the file has
    fewer than four rows, the row has no second column, or the value is not
    a number.
    """
    with open(fname) as f:
        rows = [line.split() for line in f if line.strip()]
    try:
        # Read the value by position rather than by scanning for digit runs:
        # digits inside labels ("a1: 5") or decimals ("8.5") would otherwise
        # shift or split the numbers.
        return float(rows[3][1]) > 8
    except (IndexError, ValueError):
        return False


matches = [fname for fname in glob(fpattern) if test(fname)]
print(matches)

use a function on every item in a list python

Hello I am trying to build a tool that will compress a list of folders and rename the compressed file, this list of the names of folders I want to compress are located in a .txt file, the .txt is something like this:
james, 5005
kyle, 02939
Betty, 40234
I have used multiple methods to try and build this code but I keep getting a python error set object is not subscriptable and I have no idea what to do to rectify this and on how to continue from here. Can I not use shutil.make_archive with dictionaries or can I use lists? because I would like to run this function down the first column and to rename the files i am creating using the second column. I am using python 3, and any help would be great!
import os
import shutil
# Asker's script: read a "name, number" list and archive one folder per line.
x = input("Input Path your user list: ")
filename = input("Input user file name: ")
changedir = input("Input where your folders are: ")
# The working directory changes before userfile is opened below, so a
# relative path typed for x is resolved against changedir.
os.chdir(changedir)
# Plain concatenation: no path separator is inserted between x and filename.
userfile = x + filename + ".txt"
print("Awesome your path is now", userfile)
# Count the newline characters by reading the file in chunks of 8192*1024 characters.
with open(userfile, "rt") as userfileone:
count = 0
while 1:
buffer = userfileone.read(8192*1024)
if not buffer: break
count += buffer.count('\n')
print("It is indicated that there are", count + 1, "in the file")
with open(userfile, "rt") as f:
lines = f.readlines()
# Map the first comma-separated field of each line to a tuple of the remaining
# fields; strip() is applied to the whole line only, so a space after the
# comma stays in the value.
dic = {}
for x in lines:
x = x.strip().split(',')
dic[x[0]]=tuple(x[1:])
# NOTE(review): the third positional argument of shutil.make_archive is
# root_dir; a tuple is passed for it here - confirm this is intended.
for i in dic:
shutil.make_archive(i, "zip", dic[i])
It seems like you are looking for the map function.

Categories

Resources