Flattening JSON Files for Pizzly Fusion Tables - python

I am trying to flatten JSON files that come out of the Kallisto -> Pizzly pipeline. The GitHub page provides a Python script, flatten_json.py (see below). I am still a beginner at Python and haven't been able to figure out how to run this script effectively to generate a gene table. I cannot seem to find any tutorials or vignettes online. Any guidance would be very helpful.
import sys
import json
from collections import OrderedDict

####
# gene1_name gene1_id, gene2_name, gene2_id, type, pair, split, txlist

def loadJSON(fn):
    with open(fn) as f:
        JJ = json.load(f, object_pairs_hook=OrderedDict)
    return JJ['genes']

def outputGeneTable(fusions, outf, filters=None):
    outf.write('\t'.join("geneA.name geneA.id geneB.name geneB.id paircount splitcount transcripts.list".split()))
    outf.write('\n')
    for gf in fusions:
        gAname = gf['geneA']['name']
        gAid = gf['geneA']['id']
        gBname = gf['geneB']['name']
        gBid = gf['geneB']['id']
        pairs = str(gf['paircount'])
        split = str(gf['splitcount'])
        txp = [tp['fasta_record'] for tp in gf['transcripts']]
        outf.write('\t'.join([gAname, gAid, gBname, gBid, pairs, split, ';'.join(txp)]))
        outf.write('\n')

def usage():
    print("Usage: python flatten_json.py fusion.out.json [genetable.txt]")
    print("")
    print(" outputs a flat table listing all gene fusions, if the output file is not")
    print(" specified it prints to standard output")

if __name__ == "__main__":
    nargs = len(sys.argv)
    if nargs <= 1:
        usage()
    else:
        infn = sys.argv[1]
        fusions = loadJSON(infn)
        outf = sys.stdout
        if nargs == 3:
            outf = open(sys.argv[2], 'w')
        outputGeneTable(fusions, outf)
        if outf != sys.stdout:
            outf.close()
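For what it's worth, the script's own usage() function already documents how it is meant to be invoked: the first argument is the JSON file pizzly produced, and the optional second argument is the output table. Assuming your pizzly run produced a file named fusion.out.json, something like this should generate the gene table:

python flatten_json.py fusion.out.json genetable.txt

If you omit genetable.txt, the flat table is printed to standard output instead.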

Related

How to optimize my FASTA parser Python script in order to make it run faster on slurm?

I hope I'm posting in the right place?
My script runs fine on small genomes, but it takes hours or days when working with mammalian genomes. I have tried many different things but I'm out of ideas. Can you tell me what makes this script so slow? The script parses a FASTA file; it works whether the ID lines begin with a > or not. It also calculates some genomic metrics like N50 and L50, and finally it returns a dict with sequence IDs and their lengths.
Thank you very much!
from collections import OrderedDict
import argparse

parser = argparse.ArgumentParser(description="N50 parser")
#parser = argparse.ArgumentParser(prog="N50Parser.py", usage="N50")
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input file with sequences")
parser.add_argument("-o", "--output", action="store", dest="output",
                    help="output file")
parser.add_argument("-o2", "--output2", action="store", dest="output2",
                    help="output file")
args = parser.parse_args()

def read_file(fasta_file):
    "Parse fasta file"
    descriptiondict = OrderedDict()
    dictionnary = OrderedDict()
    with open(fasta_file, 'r') as infile:
        for line in infile:
            record = line.strip()
            if record and record[0] == '>':
                seqid = record.split(" ")[0][1:]
                print(seqid)
                dictionnary[seqid] = ""
                toto = record.split(" ", 1)
                if len(toto) >= 2:
                    description = toto[1]
                    descriptiondict[seqid] = description
                else:
                    descriptiondict[seqid] = ""
                continue
            dictionnary[seqid] += record
    return dictionnary, descriptiondict

seqdict, descriptdict = read_file(args.input)

lengthdict = OrderedDict()
for sequenceid in seqdict:
    lengthdict[sequenceid] = len(seqdict[sequenceid])

length = sum(lengthdict.values())
N_number = sum([seqdict[seqid].count("N") for seqid in seqdict])
print(N_number)
print(length)

all_len = sorted(lengthdict.values(), reverse=True)
print(all_len)

if length > 0:
    acum = 0
    for y in range(len(all_len)):
        if acum <= length / 2:
            acum = all_len[y] + acum
            n = y  # L50
        else:
            break
    n = n + 1
    print("The L50 is", n)
    print("The N50 is", all_len[n-1])
    with open(args.output, 'w') as outfile:
        outfile.write("L50\t{}\n".format(n))
        outfile.write("N50\t{}\n".format(all_len[n-1]))

with open(args.output2, "w") as file:
    for key, value in lengthdict.items():
        file.write(f"{key} : {value}\n")
Don't reinvent the wheel: use well-established open source libraries for common tasks. In this case, use Biopython to parse FASTA files. (The most likely bottleneck in your own parser, by the way, is dictionnary[seqid] += record: repeatedly concatenating onto a string stored in a dict copies the whole sequence on every line, which becomes quadratic for chromosome-sized records.) For example:

from Bio import SeqIO

for seq_record in SeqIO.parse("ls_orchid.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))
Install Biopython using pip or conda:
conda create --channel bioconda --name biopython biopython
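From there, a minimal sketch of the N50/L50 calculation on top of SeqIO (assuming an input file named example.fasta) could look like this:

from Bio import SeqIO

# sequence lengths, longest first (example.fasta is a placeholder for your input)
lengths = sorted((len(rec) for rec in SeqIO.parse("example.fasta", "fasta")), reverse=True)
total = sum(lengths)

acc = 0
for i, seq_len in enumerate(lengths, start=1):
    acc += seq_len
    if acc >= total / 2:
        print("The L50 is", i)        # number of sequences needed to cover half the total length
        print("The N50 is", seq_len)  # length of the sequence that reaches the halfway point
        break

This also avoids holding all the concatenated sequence strings at once, since only the lengths are kept.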

Hashing a wordlist with big input/output files in Python 3.8

I'm a beginner in coding and am trying to build a script that takes a txt file as input, hashes each line, and outputs another txt file containing "string:hashedstring" on each line. The code works properly. The problem I am facing now is that if the input file is big, it consumes all the RAM and gets killed. I tried to use chunks, but couldn't figure out how to use them with multiline input and output.
Any suggestions regarding parts of the code other than the main subject here are very welcome, since I am just starting out. Thanks.
import argparse
import hashlib
import os
import sys

def sofia_hash(msg):
    h = ""
    m = hashlib.md5()
    m.update(msg.encode('utf-8'))
    msg_md5 = m.digest()
    for i in range(8):
        n = (msg_md5[2*i] + msg_md5[2*i+1]) % 0x3e
        if n > 9:
            if n > 35:
                n += 61
            else:
                n += 55
        else:
            n += 0x30
        h += chr(n)
    return h

top_parser = argparse.ArgumentParser(description='Sofiamass')
top_parser.add_argument('input', action="store", type=argparse.FileType('r', encoding='utf8'), help="Set input file")
top_parser.add_argument('output', action="store", help="Set output file")
args = top_parser.parse_args()

sofiainput = args.input.read().splitlines()
a = 0
try:
    while a < len(sofiainput):
        target_sofiainput = sofiainput[a]
        etarget_sofiainput = (target_sofiainput).encode('utf-8')
        try:
            sofia_pass = sofia_hash(target_sofiainput)
            x = True
        except KeyboardInterrupt:
            print("\n[---]exiting now[---]")
        if x == True:
            with open(args.output, 'a') as sofiaoutput:
                sofiaoutput.write(str(target_sofiainput) + ":" + str(sofia_pass) + "\n")
        elif x == False:
            print('error')
        a += 1
except KeyboardInterrupt:
    print("\n[---]exiting now[---]")
except AttributeError:
    pass
When you open a file with the open command, it creates an object called a file handle. So, when you do:

with open('filepath.txt', 'r') as f:
    for line in f:
        print(line)

it only keeps the current line in RAM, thus achieving your objective of using as little RAM as possible.
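Applied to your script, here is a minimal sketch (reusing your sofia_hash exactly as written) that streams line by line instead of calling read().splitlines(), so memory use stays flat no matter how large the input file is:

import argparse

# sofia_hash() as defined in the question goes here

top_parser = argparse.ArgumentParser(description='Sofiamass')
top_parser.add_argument('input', type=argparse.FileType('r', encoding='utf8'), help="Set input file")
top_parser.add_argument('output', type=argparse.FileType('w', encoding='utf8'), help="Set output file")
args = top_parser.parse_args()

with args.input as infile, args.output as outfile:
    for line in infile:
        word = line.rstrip('\n')  # drop the trailing newline, keep the word
        outfile.write(word + ":" + sofia_hash(word) + "\n")

Opening the output once in 'w' mode (instead of reopening it in 'a' mode for every word, as in your loop) also saves a lot of time on big files.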

Python - how to optimize iterator in file parsing

I get files that have NTFS audit permissions and I'm using Python to parse them. The raw CSV files list the path and then which groups have which access, such as this type of pattern:
E:\DIR A, CREATOR OWNER FullControl
E:\DIR A, Sales FullControl
E:\DIR A, HR Full Control
E:\DIR A\SUBDIR, Sales FullControl
E:\DIR A\SUBDIR, HR FullControl
My code parses the file to output this:
File Access for: E:\DIR A
CREATOR OWNER,FullControl
Sales,FullControl
HR,FullControl
File Access For: E:\DIR A\SUBDIR
Sales,FullControl
HR,FullControl
I'm new to generators but I'd like to use them to optimize my code. Nothing I've tried seems to work, so here is the original code (I know it's ugly). It works, but it's very slow. The only way I could make this work is to parse out the paths first, put them in a list, make a set so they're unique, then iterate over that list, match each path against the paths in the second list, and write out all the items it finds. Like I said, it's ugly, but it works.
import os, codecs, sys

reload(sys)
sys.setdefaultencoding('utf8')  # to prevent cp-932 errors on screen

file = "aud.csv"
outfile = "access-2.csv"
filelist = []
accesslist = []

with codecs.open(file, "r", 'utf-8-sig') as infile:
    for line in infile:
        newline = line.split(',')
        folder = newline[0].replace("\"", "")
        user = newline[1].replace("\"", "")
        filelist.append(folder)
        accesslist.append(folder + "," + user)

newfl = sorted(set(filelist))

def makeFile():
    print "Starting, please wait"
    for i in range(1, len(newfl)):
        searchItem = str(newfl[i])
        with codecs.open(outfile, "a", 'utf-8-sig') as output:
            outtext = ("\r\nFile access for: " + searchItem + "\r\n")
            output.write(outtext)
            for item in accesslist:
                searchBreak = item.split(",")
                searchTarg = searchBreak[0]
                if searchItem == searchTarg:
                    searchBreaknew = searchBreak[1].replace("FSA-INC01S\\", "")
                    searchBreaknew = str(searchBreaknew)
                    # print(searchBreaknew)
                    searchBreaknew = searchBreaknew.replace(" ", ",")
                    searchBreaknew = searchBreaknew.replace("CREATOR,OWNER", "CREATOR OWNER")
                    output.write(searchBreaknew)
How should I optimize this?
EDIT:
Here is an edited version. It works MUCH faster, though I'm sure it can still be improved:
import os, codecs, sys, csv

reload(sys)
sys.setdefaultencoding('utf8')

file = "aud.csv"
outfile = "access-3.csv"
filelist = []
accesslist = []

with codecs.open(file, "r", 'utf-8-sig') as csvinfile:
    auditfile = csv.reader(csvinfile, delimiter=",")
    for line in auditfile:
        folder = line[0]
        user = line[1].replace("FSA-INC01S\\", "")
        filelist.append(folder)
        accesslist.append(folder + "," + user)

newfl = sorted(set(filelist))

def makeFile():
    print "Starting, please wait"
    for i in xrange(1, len(newfl)):
        searchItem = str(newfl[i])
        outtext = ("\r\nFile access for: " + searchItem + "\r\n")
        accessUserlist = ""
        for item in accesslist:
            searchBreak = item.split(",")
            if searchItem == searchBreak[0]:
                searchBreaknew = str(searchBreak[1]).replace(" ", ",")
                searchBreaknew = searchBreaknew.replace("R,O", "R O")
                accessUserlist += searchBreaknew + "\r\n"
        with codecs.open(outfile, "a", 'utf-8-sig') as output:
            output.write(outtext)
            output.write(accessUserlist)
The .csv extension on your output file is misleading: the expected output you show isn't valid CSV, since a CSV record can't contain embedded newlines.
Here is a proposal using a generator that yields the output record by record:
class Audit(object):
    def __init__(self, fieldnames):
        self.fieldnames = fieldnames
        self.__access = {}

    def append(self, row):
        folder = row[self.fieldnames[0]]
        access = row[self.fieldnames[1]].strip(' ')
        access = access.replace("FSA-INC01S\\", "")
        access = access.split(' ')
        if len(access) == 3:
            if access[0] == 'CREATOR':
                access[0] += ' ' + access[1]
                del access[1]
            elif access[1] == 'Full':
                access[1] += ' ' + access[2]
                del access[2]
        if folder not in self.__access:
            self.__access[folder] = []
        self.__access[folder].append(access)

    # Generator for class Audit
    def __iter__(self):
        record = ''
        for folder in sorted(self.__access):
            record = folder + '\n'
            for access in self.__access[folder]:
                record += '%s\n' % (','.join(access))
            yield record + '\n'
How to use it:
def main():
    import io, csv
    audit = Audit(['Folder', 'Accesslist'])
    # file and outfile as in the question, e.g. "aud.csv" and "access-3.csv";
    # the sample input has no header row, so pass the field names explicitly
    with io.open(file, "r", encoding='utf-8') as csc_in:
        for row in csv.DictReader(csc_in, fieldnames=audit.fieldnames, delimiter=","):
            audit.append(row)
    with io.open(outfile, 'w', newline='', encoding='utf-8') as txt_out:
        for record in audit:
            txt_out.write(record)
Tested with Python:3.4.2 - csv:1.0
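For comparison, here is a more compact single-pass sketch of the same grouping idea (Python 3; it assumes the two-column input and the FSA-INC01S\ prefix from the question, and leaves out the name/permission cleanup for brevity), using collections.defaultdict:

import csv
from collections import defaultdict

def group_access(infn, outfn):
    # one pass over the CSV: map each folder to its list of access entries
    access = defaultdict(list)
    with open(infn, "r", encoding="utf-8-sig") as f:
        for folder, user in csv.reader(f, delimiter=","):
            access[folder].append(user.strip().replace("FSA-INC01S\\", ""))
    # open the output once, instead of once per folder
    with open(outfn, "w", encoding="utf-8") as out:
        for folder in sorted(access):
            out.write("File access for: %s\n" % folder)
            for user in access[folder]:
                out.write("%s\n" % user)
            out.write("\n")

group_access("aud.csv", "access-4.csv")

The point is the same in both versions: read the input once, group into a dict keyed by folder, and write everything out in a single pass.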

Python 2.7 - Passing a command-line argument to filter a CSV according to a parameter

I don't have code for this yet because I am not sure how to tackle it.
Using the TfidfVectorizer from scikit-learn in Python, I have calculated the tf-idf score for terms used in movie reviews.
I output my results into a CSV file with 4 columns:
Col1 = indDocID
Col2 = Word
Col3 = MovieID
Col4 = Score
I would like to pass a command-line argument that parses the CSV file and prints only those rows that correspond to a given MovieID.
That way I could pull just the results I am interested in directly from the command line, e.g. $ python tfidf.py -i uniqMovieID
I have been reading up on the literature about passing command-line arguments and experimenting with my code, but nothing I have found addresses my actual problem: filtering the rows whose MovieID matches the one I give on the command line.
An example of the data in the CSV file is
indDocID,Word,MovieID,Score
1,love,4583B,.09
2,good,4583B,.37
3,funny,4583B,.64
4,love,34623C,.34
5,hate,34623C,.57
My goal is to run the script with the following command python tfidf.py -i 4583B so that the result is:
indDocID,Word,MovieID,Score
1,love,4583B,.09
2,good,4583B,.37
3,funny,4583B,.64
I have been looking around and have tried certain examples, such as:
from optparse import OptionParser

parser = OptionParser()
parser.add_option("-i", "--idMovie", dest="arg",
                  help="insertMovieIds", metavar="variable")
as well as this:
def main(argv):
    movieIds = ''
    try:
        opts, args = getopt.getopt(argv, "hi:", ["movieid="])
    except getopt.GetoptError:
        print 'test.py -i <movieID>'
        sys.exit(1)
    for opt, arg in opts:
        if opt == '-h':
            print 'tfidf.py -i <movieIds>'
            sys.exit()
        elif opt in ("-i", "--id"):
            movieIds = arg
    print 'MovieID is:', movieIds

if __name__ == "__main__":
    main(sys.argv[1:])
Yet I am not sure how to successfully integrate them into my code so that the argument is matched against a specific column in my CSV to arrive at the desired result above.
Use a csv.DictReader pulling any rows that match the unique MovieID passed in to your script:
import csv
from optparse import OptionParser

parser = OptionParser()
parser.add_option("-i", "--idMovie", dest="arg",
                  help="insertMovieIds", metavar="variable")

opt, args = parser.parse_args()
un_id = opt.arg

with open("out.txt") as f:
    names = next(f).rstrip().split(",")
    r = csv.DictReader(f, fieldnames=names)
    print(",".join(names))
    for d in r:
        if un_id == d["MovieID"]:
            print(",".join([d[n] for n in names]))
Output:
$ python test.py -i 4583B
indDocID,Word,MovieID,Score
1,love,4583B,.09
2,good,4583B,.37
3,funny,4583B,.64
You need to add your error checking back in, etc. If you want to reuse the code for different columns, just put the logic in a function; you also probably only want to print the header if you find a match, and report back to the user if no match is found:
def find_by_val(col, k, fle, delim=","):
    with open(fle) as f:
        names = next(f).rstrip().split(delim)
        if col not in names:
            print("Column does not exist.")
            return
        r, found = csv.DictReader(f, fieldnames=names), False
        for d in r:
            if k == d[col]:
                if not found:
                    print(",".join(names))
                    print(",".join([d[n] for n in names]))
                    found = True
                else:
                    print(",".join([d[n] for n in names]))
        if not found:
            print("No matching value for {} found".format(k))
Output:
padraic@lab:~$ python test.py -i 4583B
indDocID,Word,MovieID,Score
1,love,4583B,.09
2,good,4583B,.37
3,funny,4583B,.64
padraic@lab:~$ python test.py -i foo
No matching value for foo found
If you store the output in lists, you could write it a little more succinctly:
def find_by_val(col, k, fle, delim=","):
    with open(fle) as f:
        names = next(f).rstrip().split(delim)
        if col not in names:
            print("Column does not exist.")
            return
        r = csv.DictReader(f, fieldnames=names)
        output = [",".join(d[n] for n in names) for d in r if d[col] == k]
        if not output:
            print("No matching value for {} found".format(k))
        else:
            print(",".join(names))
            print("\n".join(output))
You should also probably take the column name as an argument and take multiple values to look for if you want a more general approach.
If you only need to match rows against MovieID, the additional -i argument may be unnecessary (an additional argument can be helpful if you need to specify which column to match against). You can test the code below with python tfidf.py yourfile.csv 4583B:
#!/usr/bin/env python
import sys, csv

def search(db, mid):
    # open csv file
    with open(db, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for r in reader:
            # print line if MovieID matches
            if r[2] == mid:
                print(','.join(r))

if __name__ == '__main__':
    # parse arguments
    db, mid = sys.argv[1:]
    search(db, mid)
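If you would rather keep the -i flag from your experiments, here is a minimal sketch using argparse (the stdlib replacement for optparse and getopt, available in Python 2.7) combined with the same DictReader filtering; the reviews.csv default filename is just an assumption, adjust it to your file:

import argparse
import csv

parser = argparse.ArgumentParser(description="Filter tf-idf rows by MovieID")
parser.add_argument("-i", "--idMovie", required=True, help="MovieID to filter on")
parser.add_argument("-f", "--file", default="reviews.csv", help="input CSV file (assumed name)")
args = parser.parse_args()

with open(args.file) as f:
    reader = csv.DictReader(f)  # takes the field names from the header row
    print(",".join(reader.fieldnames))
    for row in reader:
        if row["MovieID"] == args.idMovie:
            print(",".join(row[n] for n in reader.fieldnames))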

Taxonomy classification of blast output

I am trying to do a taxonomic classification of my BLAST output, using the scripts on this website: https://github.com/bartaelterman/BlastTaxonomy.
I am quite new to Python, and I often look for existing scripts to do complicated jobs. However, scripts like this confuse me a lot. I have a very basic question: in the script below, where do I plug in my input file? Could you give me a short explanation of a script like this, so I get an idea of how it works?
Also, if you have any other ideas on how to do this, that would be great!
import sys
sys.path.append("./scripts")
import blasthittaxonomy

def checkarguments():
    if len(sys.argv) != 2:
        print "usage: ./addTaxonomyToBlastOutput.py <blast output file>"
        sys.exit(-1)

def main():
    checkarguments()
    taxfetcher = blasthittaxonomy.TaxonomyFetcher()
    filename = sys.argv[1]
    infile = open(filename)
    header = infile.readline()
    print "\t".join(["seqnr", "hitginr", "hitname", "evalue", "bitscore", "similarity", "score", "division", "scientificName", "rank1", "rank2"])
    for line in infile:
        newline = line.rstrip("\n")
        seqnr, ginr, hitname, evalue, bitscore, sim, score = newline.split("\t")
        division = ""
        scientName = ""
        rank1 = ""
        rank2 = ""
        if ginr != "":
            taxonomy = taxfetcher.getTaxonomy(int(ginr))
            if taxonomy != "":
                scientName = taxonomy[0]["ScientificName"]
                if scientName == "unidentified":
                    scientName = ""
                else:
                    division = taxonomy[0]["Division"]
                    try:
                        rank1 = taxonomy[0]["LineageEx"][0]["ScientificName"]
                    except:
                        rank1 = ""
                    try:
                        rank2 = taxonomy[0]["LineageEx"][1]["ScientificName"]
                    except:
                        rank2 = ""
        print "\t".join([seqnr, ginr, hitname, evalue, bitscore, sim, score, division, scientName, rank1, rank2])
    taxfetcher.die()

main()
You don't replace anything; just run the script with your file name as a command-line argument.
This is where the script reads the input file name:

filename = sys.argv[1]
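So, following the usage string printed by checkarguments(), an invocation sketch would be (the input filename is whatever your blast output file is called; redirecting stdout captures the table):

python addTaxonomyToBlastOutput.py myblastoutput.txt > blast_with_taxonomy.txt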
