Taxonomy classification of blast output - python

I am trying to do a taxonomic classification of my blast output, using scripts on this web-site https://github.com/bartaelterman/BlastTaxonomy.
I am quite new to python and sometimes I try to find already existing scripts to do complicated jobs. However, scripts like this confuse me a lot. I have a very basic question, i.e. in the script below, where do I replace with my input file, can you give me a short explanation of the script like this to have an idea?
Also, if you have any other ideas how to do it, would be great!
import sys
sys.path.append("./scripts")
import blasthittaxonomy
def checkarguments():
if len(sys.argv) != 2:
print "usage: ./addTaxonomyToBlastOutput.py <blast output file>"
sys.exit(-1)
def main():
checkarguments()
taxfetcher = blasthittaxonomy.TaxonomyFetcher()
filename = sys.argv[1]
infile = open(filename)
header = infile.readline()
print "\t".join(["seqnr", "hitginr", "hitname", "evalue", "bitscore", "similarity", "score", "division", "scientificName", "rank1", "rank2"])
for line in infile:
newline = line.rstrip("\n")
seqnr, ginr, hitname, evalue, bitscore, sim, score = newline.split("\t")
division = ""
scientName = ""
rank1 = ""
rank2 = ""
if ginr != "":
taxonomy = taxfetcher.getTaxonomy(int(ginr))
if taxonomy != "":
scientName = taxonomy[0]["ScientificName"]
if scientName == "unidentified":
scientName = ""
else:
division = taxonomy[0]["Division"]
try:
rank1 = taxonomy[0]["LineageEx"][0]["ScientificName"]
except:
rank1 = ""
try:
rank2 = taxonomy[0]["LineageEx"][1]["ScientificName"]
except:
rank2 = ""
print "\t".join([seqnr, ginr, hitname, evalue, bitscore, sim, score, division, scientName, rank1, rank2])
taxfetcher.die()
main()

You don't replace anything, just run the script with your file name as a command-line argument
That's where the script reads the input file name:
filename = sys.argv[1]

Related

Hashing wordlist with big input output files in Python 3.8

I'm a beginner in coding and am trying to build a script that takes a txt file as an input, hash it and output to another txt file containing "string:hashedstring" in each line of it. The code is working properly. The problem I am facing now is that if the input file is big, it will consume all RAM and kill it. I tried to use chunks, but couldn't figure out how to use it with multiline input and output.
Any suggestions regarding other parts of the code other than the main subject here is very welcome, since I am just starting on this. Thanks.
import argparse
import hashlib
import os
import sys
def sofia_hash(msg):
h = ""
m = hashlib.md5()
m.update(msg.encode('utf-8'))
msg_md5 = m.digest()
for i in range(8):
n = (msg_md5[2*i] + msg_md5[2*i+1]) % 0x3e
if n > 9:
if n > 35:
n += 61
else:
n += 55
else:
n += 0x30
h += chr(n)
return h
top_parser = argparse.ArgumentParser(description='Sofiamass')
top_parser.add_argument('input', action="store", type=argparse.FileType('r', encoding='utf8'), help="Set input file")
top_parser.add_argument('output', action="store", help="Set output file")
args = top_parser.parse_args()
sofiainput = args.input.read().splitlines()
a = 0
try:
while a < len(sofiainput):
target_sofiainput = sofiainput[a]
etarget_sofiainput = (target_sofiainput).encode('utf-8')
try:
sofia_pass = sofia_hash(target_sofiainput)
x = True
except KeyboardInterrupt:
print ("\n[---]exiting now[---]")
if x == True:
with open(args.output, 'a') as sofiaoutput:
sofiaoutput.write(str(target_sofiainput) + ":" + str(sofia_pass) + "\n")
elif x == False:
print('error')
a += 1
except KeyboardInterrupt:
print ("\n[---]exiting now[---]")
except AttributeError:
pass
When you open the file with the open command, it creates a object called file handler. So, when you do:
with open('filepath.txt', 'r') as f:
for line in f:
print(line)
it only keeps the current line you are using in the RAM, thus achieving your objective to use as little as RAM as possible.

Flattening JSON Files for Pizzly Fusion Tables

I am trying to flatten JSON files that come out of the Kallisto -> Pizzly pipeline. The GitHub page provides a python script flatten_json.py (see below). I am still a beginner at python, and haven't been able to figure out how to run this script effectively to generate a gene table. I cannot seem to find any tutorials or vignettes online. Any guidance would be very helpful.
import sys
import json
from collections import OrderedDict
####
# gene1_name gene1_id, gene2_name, gene2_id, type, pair, split, txlist
def loadJSON(fn):
with open(fn) as f:
JJ = json.load(f,object_pairs_hook=OrderedDict)
return JJ['genes']
def outputGeneTable(fusions, outf, filters = None):
outf.write('\t'.join("geneA.name geneA.id geneB.name geneB.id paircount splitcount transcripts.list".split()))
outf.write('\n')
for gf in fusions:
gAname = gf['geneA']['name']
gAid = gf['geneA']['id']
gBname = gf['geneB']['name']
gBid = gf['geneB']['id']
pairs = str(gf['paircount'])
split = str(gf['splitcount'])
txp = [tp['fasta_record'] for tp in gf['transcripts']]
outf.write('\t'.join([gAname, gAid, gBname, gBid, pairs, split, ';'.join(txp)]))
outf.write('\n')
def usage():
print("Usage: python flatten_json.py fusion.out.json [genetable.txt]")
print("")
print(" outputs a flat table listing all gene fusions, if the output file is not")
print(" specified it prints to standard output")
if __name__ == "__main__":
nargs = len(sys.argv)
if nargs <= 1:
usage()
else:
infn = sys.argv[1]
fusions = loadJSON(infn)
outf = sys.stdout
if nargs == 3:
outf = open(sys.argv[2],'w')
outputGeneTable(fusions,outf)
if outf != sys.stdout:
outf.close()

Error in wikipedia subcategory crawling using python3

Hello Community Members,
I am getting the error NameError: name 'f' is not defined. The code is as follows. Please help. Any sort of help is appreciated. I have been strucked onto this since 3 days. The code is all about to extract all the subcategories name of wikipedia category in Python 3.
I have tried both the relative and absolute paths.
The code is as follows:
import httplib2
from bs4 import BeautifulSoup
import subprocess
import time, wget
import os, os.path
#declarations
catRoot = "http://en.wikipedia.org/wiki/Category:"
MAX_DEPTH = 100
done = []
ignore = []
path = 'trivial'
#Removes all newline characters and replaces with spaces
def removeNewLines(in_text):
return in_text.replace('\n', ' ')
# Downloads a link into the destination
def download(link, dest):
# print link
if not os.path.exists(dest) or os.path.getsize(dest) == 0:
subprocess.getoutput('wget "' + link + '" -O "' + dest+ '"')
print ("Downloading")
def ensureDir(f):
if not os.path.exists(f):
os.mkdir(f)
# Cleans a text by removing tags
def clean(in_text):
s_list = list(in_text)
i,j = 0,0
while i < len(s_list):
#iterate until a left-angle bracket is found
if s_list[i] == '<':
if s_list[i+1] == 'b' and s_list[i+2] == 'r' and s_list[i+3] == '>':
i=i+1
print ("hello")
continue
while s_list[i] != '>':
#pop everything from the the left-angle bracket until the right-angle bracket
s_list.pop(i)
#pops the right-angle bracket, too
s_list.pop(i)
elif s_list[i] == '\n':
s_list.pop(i)
else:
i=i+1
#convert the list back into text
join_char=''
return (join_char.join(s_list))#.replace("<br>","\n")
def getBullets(content):
mainSoup = BeautifulSoup(contents, "html.parser")
# Gets empty bullets
def getAllBullets(content):
mainSoup = BeautifulSoup(str(content), "html.parser")
subcategories = mainSoup.findAll('div',attrs={"class" : "CategoryTreeItem"})
empty = []
full = []
for x in subcategories:
subSoup = BeautifulSoup(str(x))
link = str(subSoup.findAll('a')[0])
if (str(x)).count("CategoryTreeEmptyBullet") > 0:
empty.append(clean(link).replace(" ","_"))
elif (str(x)).count("CategoryTreeBullet") > 0:
full.append(clean(link).replace(" ","_"))
return((empty,full))
def printTree(catName, count):
catName = catName.replace("\\'","'")
if count == MAX_DEPTH : return
download(catRoot+catName, path)
filepath = "categories/Category:"+catName+".html"
print(filepath)
content = open('filepath', 'w+')
content.readlines()
(emptyBullets,fullBullets) = getAllBullets(content)
f.close()
for x in emptyBullets:
for i in range(count):
print (" "),
download(catRoot+x, "categories/Category:"+x+".html")
print (x)
for x in fullBullets:
for i in range(count):
print (" "),
print (x)
if x in done:
print ("Done... "+x)
continue
done.append(x)
try: printTree(x, count + 1)
except:
print ("ERROR: " + x)
name = "Cricket"
printTree(name, 0)
The error encountered is as follows.
I think f.close() should be content.close().
It's common to use a context manager for such cases, though, like this:
with open(filepath, 'w+') as content:
(emptyBullets,fullBullets) = getAllBullets(content)
Then Python will close the file for you, even in case of an exception.
(I also changed 'filepath' to filepath, which I assume is the intent here.)

Python - how to optimize iterator in file parsing

I get files that have NTFS audit permissions and I'm using Python to parse them. The raw CSV files list the path and then which groups have which access, such as this type of pattern:
E:\DIR A, CREATOR OWNER FullControl
E:\DIR A, Sales FullControl
E:\DIR A, HR Full Control
E:\DIR A\SUBDIR, Sales FullControl
E:\DIR A\SUBDIR, HR FullControl
My code parses the file to output this:
File Access for: E:\DIR A
CREATOR OWNER,FullControl
Sales,FullControl
HR,FullControl
File Access For: E:\DIR A\SUBDIR
Sales,FullControl
HR,FullControl
I'm new to generators but I'd like to use them to optimize my code. Nothing I've tried seems to work, so here is the original code (I know it's ugly). It works but it's very slow. The only way I can do this is by parsing out the paths first, put them in a list, make a set so that they're unique, then iterate over that list and match them with the path in the second list, and list all of the items it finds. Like I said, it's ugly but works.
import os, codecs, sys
reload(sys)
sys.setdefaultencoding('utf8') // to prevent cp-932 errors on screen
file = "aud.csv"
outfile = "access-2.csv"
filelist = []
accesslist = []
with codecs.open(file,"r",'utf-8-sig') as infile:
for line in infile:
newline = line.split(',')
folder = newline[0].replace("\"","")
user = newline[1].replace("\"","")
filelist.append(folder)
accesslist.append(folder+","+user)
newfl = sorted(set(filelist))
def makeFile():
print "Starting, please wait"
for i in range(1,len(newfl)):
searchItem = str(newfl[i])
with codecs.open(outfile,"a",'utf-8-sig') as output:
outtext = ("\r\nFile access for: "+ searchItem + "\r\n")
output.write(outtext)
for item in accesslist:
searchBreak = item.split(",")
searchTarg = searchBreak[0]
if searchItem == searchTarg:
searchBreaknew = searchBreak[1].replace("FSA-INC01S\\","")
searchBreaknew = str(searchBreaknew)
# print(searchBreaknew)
searchBreaknew = searchBreaknew.replace(" ",",")
searchBreaknew = searchBreaknew.replace("CREATOR,OWNER","CREATOR OWNER")
output.write(searchBreaknew)
How should I optimize this?
EDIT:
Here is an edited version. It works MUCH faster, though I'm sure it can still be fixed:
import os, codecs, sys, csv
reload(sys)
sys.setdefaultencoding('utf8')
file = "aud.csv"
outfile = "access-3.csv"
filelist = []
accesslist = []
with codecs.open(file,"r",'utf-8-sig') as csvinfile:
auditfile = csv.reader(csvinfile, delimiter=",")
for line in auditfile:
folder = line[0]
user = line[1].replace("FSA-INC01S\\","")
filelist.append(folder)
accesslist.append(folder+","+user)
newfl = sorted(set(filelist))
def makeFile():
print "Starting, please wait"
for i in xrange(1,len(newfl)):
searchItem = str(newfl[i])
outtext = ("\r\nFile access for: "+ searchItem + "\r\n")
accessUserlist = ""
for item in accesslist:
searchBreak = item.split(",")
if searchItem == searchBreak[0]:
searchBreaknew = str(searchBreak[1]).replace(" ",",")
searchBreaknew = searchBreaknew.replace("R,O","R O")
accessUserlist += searchBreaknew+"\r\n"
with codecs.open(outfile,"a",'utf-8-sig') as output:
output.write(outtext)
output.write(accessUserlist)
I'm misguided from your used .csv file extension.
Your given expected output isn't compatible with csv, as inside a record no \n possible.
Proposal using a generator returning record by record:
class Audit(object):
def __init__(self, fieldnames):
self.fieldnames = fieldnames
self.__access = {}
def append(self, row):
folder = row[self.fieldnames[0]]
access = row[self.fieldnames[1]].strip(' ')
access = access.replace("FSA-INC01S\\", "")
access = access.split(' ')
if len(access) == 3:
if access[0] == 'CREATOR':
access[0] += ' ' + access[1]
del access[1];
elif access[1] == 'Full':
access[1] += ' ' + access[2]
del access[2];
if folder not in self.__access:
self.__access[folder] = []
self.__access[folder].append(access)
# Generator for class Audit
def __iter__(self):
record = ''
for folder in sorted(self.__access):
record = folder + '\n'
for access in self.__access[folder]:
record += '%s\n' % (','.join(access) )
yield record + '\n'
How to use it:
def main():
import io, csv
audit = Audit(['Folder', 'Accesslist'])
with io.open(file, "r", encoding='utf-8') as csc_in:
for row in csv.DictReader(csc_in, delimiter=","):
audit.append(row)
with io.open(outfile, 'w', newline='', encoding='utf-8') as txt_out:
for record in audit:
txt_out.write(record)
Tested with Python:3.4.2 - csv:1.0

Unable to execute the else part of the if in the last line

When string is matched to the index without the else block in the if statment code works as expected.
But when else part is added it always displays "No string"
import sys
def string_search_index():
'''
This function search a string in a file through index and gives the result.
'''
if len(sys.argv) != 3:
print "Enter Two Arguments Only"
sys.exit(1)
stringsrch = sys.argv[2]
#size = len(sys.argv)
file_name = open("passwd", "r")
#print "No of Argu ", size
if sys.argv[1].isdigit():
fieldindex = int(sys.argv[1])-1
else:
print "Enter Integer in 1st Argument"
sys.exit(1)
fieldindex = int(sys.argv[1])-1
for store_file in file_name:
temp = store_file.split(":")
search = temp[fieldindex]
#print search
if stringsrch in search:
print store_file
else:
print "No String"
sys.exit(1)
string_search_index()
The string not found test should be outside the for loop:
file = open("my_file", "r")
string_found = False
# Loop over the lines of the file
for line in file:
temp = line.split(":")
if test_string in temp[index]:
print('Found it : {}'.format(line))
string_found = True
# String not found case
if not string_found:
print("String not found")

Categories

Resources