################################################################################
#...................Program to create feature vector (N-grams) ...............
################################################################################
import ast
import csv
import os
import sys
from string import *
from BST import Node

maxInt = sys.maxsize
decrement = True

while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
        decrement = True

def File_Write(filename, write_ist):
    filewrite = open(filename, "w")
    filewrite.write(str(write_ist))
    filewrite.close()
#################################################################################
#................Function to read the files and create the FVT...................
# Change 2,3,4,5,6,7
#################################################################################
def read_file_list(bigram, class_label):
    frqlist = []
    root.reset_frequency()
    for i in range(0, len(bigram)):
        row = str(bigram[i])
        row = row.strip()
        row = row.replace('(', '')
        row = row.replace(')', '')
        row = row.replace("'", '')
        row = row.replace(",", ' ')
        node, parent = root.lookup(row)
        if node:
            root.increment_frequency(node)
    frqlist = root.print_tree()
    ## Attach the class_label............
    root.finalize_frq_lst(class_label)
    root.write_to_csv(file_write1)
##################################################################################
#.................................MAIN PROGRAM....................................
##################################################################################
feature_list = ""
root_flag = 'false'

file_path_data = "/home/xxx/Project/Dataset/Cross/N_grams/7_POStags.csv"          ## Input file containing bigrams of blog data
fp_data = csv.reader(open(file_path_data, "r"), delimiter=',')

file_path_feature = "/home/xxx/Project/Dataset/Cross/N_gram_Features/7_gram.txt"  ## Input file containing sorted bigrams
fp_feature = open(file_path_feature, "r")
list1 = fp_feature.read()

#### Convert string into list ...................
read_list = ast.literal_eval(list1)
read_list1 = list(set(read_list))
print read_list1

for i in range(0, len(read_list)):
    feature = str(read_list[i])
    feature = feature.strip()
    feature = feature.replace('(', '')
    feature = feature.replace(')', '')
    feature = feature.replace("'", '')
    feature = feature.replace(",", ' ')
    if root_flag == 'false':
        root = Node(feature)
        root_flag = 'true'
    else:
        root.insert(feature)
    feature_list = feature_list + "\n" + feature

feature_list1 = feature_list.strip()
line = feature_list1.split('\n')
##print "#######################################################################"
##print line
line1 = list(set(line))
print len(line1)
##print "#######################################################################"
line1.sort()
i = 1

###### Setting the path for input and output files .......................
output_file = "/home/xxx/Project/Dataset/Cross/N_grams_recored/7_gram.csv"        ## Output file..............
with open(output_file, 'w') as fo:
    file_write1 = csv.writer(fo, delimiter=',', quotechar='"')
    #### Write header data into output file
    file_write1.writerow(line1)
    for data in fp_data:
        feature = ast.literal_eval(data[0])
        class_label = data[1]
        read_file_list(feature, class_label)
        print feature
        print i
        i = i + 1
This is my code to record the counts of 7-grams over a dataset of 3,277 samples. I am trying to build a BST of roughly 76,000 7-grams, but I am getting an error like this:
Traceback (most recent call last):
File "N_gram_Record (2-7).py", line 79, in <module>
read_list=ast.literal_eval(list1)
File "/usr/lib/python2.7/ast.py", line 49, in literal_eval
node_or_string = parse(node_or_string, mode='eval')
File "/usr/lib/python2.7/ast.py", line 37, in parse
return compile(source, filename, mode, PyCF_ONLY_AST)
MemoryError
I think the MemoryError occurs when I try to create the BST of 7-grams, since there are about 76,000 of them. Any idea how to overcome this problem?
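The traceback actually points at ast.literal_eval(list1), which has to parse the entire 76,000-item list from one giant string and hold the whole parse tree in memory before anything else runs. A minimal sketch of one way around that (this is an assumption, not the original code: it presumes the 7-gram feature file can be regenerated with one tuple literal per line, so each line can be parsed separately):

import ast

def iter_features(path):
    """Yield one cleaned feature string per line of the feature file."""
    with open(path, "r") as fp:
        for line in fp:
            line = line.strip().rstrip(',')
            # skip blank lines and stray list brackets
            if not line or line in ('[', ']'):
                continue
            item = ast.literal_eval(line)      # parse one small literal at a time
            row = str(item)
            for ch in "()'":
                row = row.replace(ch, '')
            yield row.replace(',', ' ')

# usage sketch, replacing the ast.literal_eval(fp_feature.read()) step:
# for feature in iter_features(file_path_feature):
#     if root_flag == 'false':
#         root = Node(feature)
#         root_flag = 'true'
#     else:
#         root.insert(feature)

This keeps only one feature in memory at a time while the BST is being built, instead of the full list literal plus its parse tree.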
I'm trying to run the below python script (vcf2treemix.py) with the command
<./vcf2treemix.py -vcf allsamples14_filtered_1_autosomes38_bisnps.vcf.gz -pop allsamples14.clust.pop>
I got this error with both Python 2 and 3:
######### error ###
Traceback (most recent call last):
File "./vcf2treemix.py", line 99, in <module>
main()
File "./vcf2treemix.py", line 95, in main
pop_obj = get_pops(pop_file)
File "./vcf2treemix.py", line 34, in get_pops
pops[fields[0]] = fields[1].split()
IndexError: list index out of range
######### vcf2treemix.py ###
#!/usr/bin/python
# vcf2treemix.py
# Converts a vcf file into TreeMix input

import argparse
from collections import OrderedDict

parser = argparse.ArgumentParser(description="Parsing statistical output of"
                                             " VCFtools")
parser.add_argument("-vcf", dest="vcf_file",
                    help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14_filtered_1_autosomes38_bisnps_main.vcf.gz",
                    required=True)
parser.add_argument("-pop", dest="pop_file",
                    help="/mnt/ursus/GROUP-sbifh3/c1845371/whole_genome/data_dog/align_out/treemix/allsamples14.clust.pop",
                    required=True)
arg = parser.parse_args()

def get_pops(pop_file):
    """
    Returns a dictionary with pop identifier as key and taxa as a list of
    strings. In the pop file, each population should be on one line, starting
    with the pop name, a colon and the corresponding taxa separated by
    whitespace. E.g.:
    pop1: taxon1 taxon2 taxon3
    """
    pops = OrderedDict()
    with open(pop_file) as fh:
        for line in fh:
            fields = line.strip().split(":")
            pops[fields[0]] = fields[1].split()
    return pops

def vcf2treemix(vcf_file, pop_obj):
    """
    Converts a vcf file into treemix format.
    """
    vcf_fh = open(vcf_file)
    output_name = vcf_file.strip(".vcf") + ".tmix"
    output_fh = open(output_name, "w")
    # Write header for tmix file
    output_fh.write("{}\n".format(" ".join([x for x in pop_obj.keys()])))
    for line in vcf_fh:
        # Skip header
        if line.startswith("##"):
            pass
        # Get taxon positions
        elif line.startswith("#CHROM"):
            taxa_pos = line.strip().split()
        # Ignore empty lines
        elif line.strip() != "":
            fields = line.strip().split()
            # Ignore loci with more than two alleles
            if len(fields[4]) > 1:
                continue
            # Get allele counts for each population
            temp_pop = OrderedDict((x, [0, 0]) for x in pop_obj.keys())
            for pop, taxa in pop_obj.items():
                for taxon in taxa:
                    # Get taxon genotype
                    gen = fields[taxa_pos.index(taxon)]
                    # Skip if gen is missing data
                    if gen == "./.":
                        continue
                    temp_pop[pop][0] += gen.count("0")
                    temp_pop[pop][1] += gen.count("1")
            # Write current locus to file
            output_fh.write("{}\n".format(" ".join([str(x[0]) + "," + str(x[1]) for x in temp_pop.values()])))
    vcf_fh.close()
    output_fh.close()

def main():
    # Args
    vcf_file = arg.vcf_file
    pop_file = arg.pop_file
    pop_obj = get_pops(pop_file)
    vcf2treemix(vcf_file, pop_obj)

main()
I have zero experience with Python; I just run the script to manipulate genetic data.
Any help would be highly appreciated.
Thanks,
Ali
I tried Python 2 and 3, and I expected the script to work straightforwardly. I think there is no problem with the input data.
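The traceback points at get_pops: line.strip().split(":") only produces a second element when the line actually contains a colon, so a blank line or a line without a colon in the .pop file (for example a trailing empty line) makes fields[1] raise IndexError. A minimal sketch of a more defensive version, assuming the pop-file format described in the script's own docstring (pop1: taxon1 taxon2 ...):

from collections import OrderedDict

def get_pops(pop_file):
    """Like the original get_pops, but skips blank or malformed lines."""
    pops = OrderedDict()
    with open(pop_file) as fh:
        for line in fh:
            line = line.strip()
            if not line or ":" not in line:   # skip lines without 'name: taxa'
                continue
            name, taxa = line.split(":", 1)
            pops[name.strip()] = taxa.split()
    return pops

It may also be worth checking that allsamples14.clust.pop really uses a colon after each population name, since a file separated only by whitespace would fail on every line.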
I am currently working on a project for which I need to download a few thousand citations from PubMed. I am currently using BioPython and have written this code:
from Bio import Entrez
from Bio import Medline
from pandas import *
from sys import argv
import os

Entrez.email = "my_email"
df = read_csv("my_file_path")
i = 0
for index, row in df.iterrows():
    print (row.id)
    handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=row.id)
    records = Medline.parse(handle)
    for record in records:
        try:
            abstract = str(record["AB"])
        except:
            abstract = "none"
        try:
            title = str(record["TI"])
        except:
            title = "none"
        try:
            mesh = str(record["MH"])
        except:
            mesh = "none"
        path = 'my_file_path'
        filename = str(row.id) + '.txt'
        filename = os.path.join(path, filename)
        file = open(filename, "w")
        output = "title: " + str(title) + "\n\n" + "abstract: " + str(abstract) + "\n\n" + "mesh: " + str(mesh) + "\n\n"
        file.write(output)
        file.close()
    print (i)
    i = i + 1
However, I receive the following error when this code is run:
Traceback (most recent call last):
File "my_file_path", line 13, in <module>
handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=row.id)
File "/.../anaconda/lib/python3.5/site-packages/biopython-1.68-py3.5-macosx-10.6-x86_64.egg/Bio/Entrez/__init__.py", line 176, in efetch
if ids.count(",") >= 200:
AttributeError: 'numpy.int64' object has no attribute 'count'
Here are the first few rows of the CSV file:
id
10029645
10073846
10078088
10080457
10088066
...
Your error is at
handle = Entrez.efetch(db="pubmed",rettype="medline",retmode="text", id=row.id)
From the documentation
id
UID list. Either a single UID or a comma-delimited list of UIDs
From the examples I have seen, id should be a string, not a numpy.int64 coming out of a pandas DataFrame. You should convert row.id to a string.
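For example, the same call as in the question with only the id argument cast to str:

handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=str(row.id))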
I have two files. One creates a numpy array in compressed sparse row format
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.porter import PorterStemmer
import numpy as np
import csv
import json
import string
import pdb

def stem_document(document):
    translatedict = ""
    stemmer = PorterStemmer()
    for word in string.punctuation:
        translatedict = translatedict + word
    doc_stemmed = []
    for word in document.split():
        lowerstrippedword = ''.join(c for c in word.lower() if c not in translatedict)
        try:
            stemmed_word = stemmer.stem(lowerstrippedword)
            doc_stemmed.append(stemmed_word)
        except:
            print lowerstrippedword + " could not be stemmed."
    return ' '.join(doc_stemmed)

def readFileandStem(filestring):
    with open(filestring, 'r') as file:
        reader = csv.reader(file)
        file_extras = []
        vector_data = []
        error = False
        while (error == False):
            try:
                next = reader.next()
                if len(next) == 3 and next[2] != "":
                    document = next[2]
                    stemmed_document = stem_document(document)
                    vector_data.append(stemmed_document)
                    file_extra = []
                    file_extra.append(next[0])
                    file_extra.append(next[1])
                    file_extras.append(file_extra)
            except:
                error = True
    return [vector_data, file_extras]

filestring = 'Data.csv'
print "Reading File"
data = readFileandStem(filestring)
documents = data[0]
file_extras = data[1]

print "Vectorizing Data"
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(documents)
tf_idf_transform = TfidfTransformer(use_idf=False).fit(matrix)
tf_idf_matrix = tf_idf_transform.transform(matrix)

with open('matrix/matrix.npy', 'w') as matrix_file:
    np.save(matrix_file, tf_idf_matrix)

file_json_map = {}
file_json_map['extras'] = file_extras
with open('matrix/extras.json', 'w') as extras_file:
    extras_file.write(json.dumps(file_json_map))
print "finished"
The next file is supposed to load the same file...
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
import json
import pdb

with open('matrix/matrix.npy', 'r') as matrix_file:
    matrix = np.load(matrix_file)

hcluster = linkage(matrix, "complete")
However, I get the following error:
File "Cluster.py", line 7, in <module>
matrix = np.load(matrix_file)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\npyio.py", line 406, in load
pickle_kwargs=pickle_kwargs)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\format.py", line 620, in read_array
version = read_magic(fp)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\format.py", line 216, in read_magic
raise ValueError(msg % (MAGIC_PREFIX, magic_str[:-2]))
ValueError: the magic string is not correct; expected '\x93NUMPY', got '\x00\x00I\x1c\x00\x00'
I don't know why the magic string would be incorrect because from what I've looked into, all .npy files are supposed to have the same magic string "\x93NUMPY".
Ideas?
I encountered a similar issue before.
Changing
open('matrix/matrix.npy', 'w')
...
open('matrix/matrix.npy', 'r')
to
open('matrix/matrix.npy', 'wb')
...
open('matrix/matrix.npy', 'rb')
solved my problem.
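For completeness, a sketch of the full round trip with binary modes (same paths as in the question; tf_idf_matrix is the object saved by the first script). On Windows, opening the file in text mode translates line endings inside the binary stream, which typically corrupts the .npy header and produces a wrong magic string:

import numpy as np

with open('matrix/matrix.npy', 'wb') as matrix_file:   # write in binary mode
    np.save(matrix_file, tf_idf_matrix)

with open('matrix/matrix.npy', 'rb') as matrix_file:   # read back in binary mode
    matrix = np.load(matrix_file)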
Traceback (most recent call last):
File "C:\Users\Brett\Desktop\Jefferson_final.py", line 31, in
point = arcpy.Point(coordinate[0],coordinate[1])
IndexError: list index out of range
Here is my script
import arcpy
import fileinput
import os
import string

arcpy.env.workspace = "C:\Users\Brett\Desktop\San Clemente"
arcpy.env.overwriteOutput = True
outFolder = "C:\Users\Brett\Desktop\San Clemente"
output = open("result.txt", "w")
fc = "metersSC1.shp"
inputFile = "C:\Users\Brett\Desktop\San Clemente\San Clemente Lat Long.txt"
name = ""
for line in inputFile:
    lineSegment = line.split(": ")
    if lineSegment[0] == "spatialReference":
        spatRef = spatRef = arcpy.SpatialReference(26946)
        arcpy.CreateFeatureclass_management(outFolder, fc, "POINT", "", "", "", spatRef)
        arcpy.AddField_management(fc, "NAME", "TEXT")
        cursor = arcpy.da.InsertCursor(fc, ["", "SHAPE#"])
        array = arcpy.Array()
    elif lineSegment[0] == "NAME":
        if len(array) > 0:
            polygon = arcpy.Polygon(pointList)
            cursor.insertRow((name, polygon))
        name = lineSegment[0]
    else:
        coordinate = line.split(",")
        point = arcpy.Point(coordinate[0], coordinate[1])
        pointList.add(point)
polygon = arcpy.Polygon(array)
cursor.insertRow((name, polygon))
print "COORDINATES ADDED"
You are not opening the input file and reading the data from it before splitting it; for line in inputFile iterates over the characters of the path string instead. Something like
inputname = r"C:\Users\Brett\Desktop\San Clemente\San Clemente Lat Long.txt"
inputdata = [l.split(',') for l in open(inputname).readlines()]
should get you further. You will also have to convert x and y to numeric values, since they will be strings to start with.
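A minimal sketch of that conversion (hedged: it assumes the coordinate lines in the text file look like x,y):

inputname = r"C:\Users\Brett\Desktop\San Clemente\San Clemente Lat Long.txt"
coords = []
with open(inputname) as fh:
    for line in fh:
        parts = line.strip().split(',')
        if len(parts) == 2:                          # only plain "x,y" lines
            x, y = float(parts[0]), float(parts[1])  # convert strings to numbers
            coords.append((x, y))
# each (x, y) pair can then be passed to arcpy.Point(x, y)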
Long time listener, first time caller! So, I have this Python script for parsing a Google Base Feed text file. It takes out particular pieces of data and creates a formatted file I can upload to Bing Shopping. After finally getting it to run, I've discovered that it just outputs blank files instead of the cleaned-up data I wanted. What am I missing here? I really appreciate any help! Fair warning, I'm a pretty big Python newb, and I've had a lot of help writing this already.
import sys, os
import pandas as pd
import datetime

def remove_quotes(data):
    lines = data.split('\n')
    for i, line in enumerate(lines):
        lines[i] = lines[i].replace('"', '')
        print lines[i]
    return data

def tab_error(index, line, output):
    count = len(line.split('\t'))
    if count != 19:
        err = 'Tab issue at line {linenum} : {numtabs} extra tabs'.\
              format(linenum=index, numtabs=(count-19))
        print err
        output.write(err+'\n')
        return True
    return False

def html_error(index, line, output):
    htmltags = ['&fract12', '&39', '&', '&qt;', '<', '&rt;', '"', '>', 'quot', '’']
    for tag in htmltags:
        if line.find(tag) > 0:
            err = 'HTML issue at line {linenum}'.\
                  format(linenum=index)
            print err
            output.write(err+'\n')
            return True
    return False

def read_data(filename):
    with open(filename, 'r') as infile:
        data = infile.read()
    return data

def tabs_check(data, output, filename):
    with open(filename, 'w') as cleanfile:
        header = ''
        for x in xrange(19):
            header += 'x'+str(x+1)+'\t'
        cleanfile.write(header)
        # for each line in the file
        for i, line in enumerate(data.split('\r')[1:]):
            # check line for tabs error
            data_error = tab_error(i, line, output)
            newline = line.replace('"', '')
            newline = newline.strip()
            if not data_error:
                cleanfile.write('\n'+newline)

def html_check(data, output, filename):
    with open(filename, 'w') as cleanfile:
        # for each line in the file
        lines = data.split('\n')
        cleanfile.write(lines[0])
        for i, line in enumerate(lines[1:]):
            # check line for HTML errors
            data_error = html_error(i, line, output)
            newline = line.replace('"', '')
            newline = newline.strip()
            if not data_error and newline:
                cleanfile.write('\n'+newline)

if __name__ == '__main__':
    # Clean tabs
    filename = sys.argv[1]
    ts = datetime.datetime.now().isoformat()
    print ts
    with open('bing_errors.txt', 'w') as output:
        # print 'Removing quotes within .. product description and ...'
        # data = remove_quotes(data)
        print 'Removing lines with more than 19 tabs...'
        data = read_data(filename)
        tabs_check(data, output, 'clean19.txt')

        # Delete and reorder columns
        print 'Deleting and reordering columns...'
        df = pd.read_table('clean19.txt')
        tmp = df[['x8', 'x2', 'x3', 'x4', 'x6', 'x1', 'x5']]
        tmp.columns = ['MPID',
                       'Brand (BrandorManufacturer)',
                       'Title',
                       'Item Description',
                       'Price',
                       'ProductURL',
                       'ImageURL']
        tmp.to_csv('tmp.txt', index=False, sep='\t')
        os.remove('clean19.txt')

        # HTML errors
        print 'Checking for HTML errors...'
        data = read_data('tmp.txt')
        html_check(data, output, 'BT1.txt')
        os.remove('tmp.txt')

        # row = tmp[tmp['MPID'] == 8724]
        # print row
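One thing that stands out in tabs_check: it splits the raw text on '\r', so if the feed file uses '\n' (or '\r\n') line endings, data.split('\r')[1:] is empty, only the header row ever gets written to clean19.txt, and every later step then works on an effectively empty table. A hedged sketch of the same function using splitlines(), which handles any line ending (everything else is unchanged from the question's code):

def tabs_check(data, output, filename):
    with open(filename, 'w') as cleanfile:
        header = ''
        for x in xrange(19):
            header += 'x' + str(x + 1) + '\t'
        cleanfile.write(header)
        # splitlines() handles '\n', '\r' and '\r\n' uniformly
        for i, line in enumerate(data.splitlines()[1:]):
            data_error = tab_error(i, line, output)
            newline = line.replace('"', '').strip()
            if not data_error and newline:
                cleanfile.write('\n' + newline)

If the feed really does use bare '\r' separators, this behaves the same as before, so it should be a safe change to try first.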