I'm trying to find the optimal way to append some data to a JSON file using Python. Basically, I have about 100 threads open, each storing data to an array. When they are done, they send that to a JSON file using json.dump. However, since it can take a few hours for the array to build up, I eventually run out of RAM. So I'm trying to find the best way to use the least amount of RAM in this process. The following is what I have, which consumes too much RAM.
# Collect every follower id of self.ip into one in-memory dict and dump the
# whole dict to a JSON file at the end (Python 2 syntax).
# NOTE(review): the indentation of this snippet was lost; the nesting below is
# a reconstruction -- confirm against the original post.
i = 0
twitter_data = {}  # one entry is added per follower; nothing is written until the final json.dump
for null in range(0,1):  # runs exactly once
    while True:
        try:
            for friends in Cursor(api.followers_ids,screen_name=self.ip).items():
                twitter_data[i] = {}
                twitter_data[i]['fu'] = self.ip
                twitter_data[i]['su'] = friends
                i = i + 1
        except tweepy.TweepError, e:
            # Report the error on the console and append it to the error log.
            print "ERROR on " + str(self.ip) + " Reason: ", e
            with open('C:/Twitter/errors.txt', mode='a') as a_file:
                new_ii = "ERROR on " + str(self.ip) + " Reason: " + str(e) + "\n"
                a_file.write(new_ii)
        break  # presumably leaves the `while True` after one attempt -- placement reconstructed
## Save data
with open('C:/Twitter/user_' + str(self.id) + '.json', mode='w') as f:
    json.dump(twitter_data, f, indent=2, encoding='utf-8')
Thanks
Output the individual items as an array as they're created, creating the JSON formatting for the array around it manually. JSON is a simple format, so this is trivial to do.
Here's a simple example that prints out a JSON array, without having to hold the entire contents in memory; only a single element in the array needs to be stored at once.
import json
import sys


def get_item():
    """Return one sample element of the array."""
    return {"a": 5, "b": 10}


def get_array():
    """Yield the text of a JSON array piece by piece.

    Only one element is serialized at a time, so the complete array never
    has to be held in memory.
    """
    yield "["
    for x in range(5):
        # JSON does not allow a trailing comma, so the separator is emitted
        # before every element except the first.
        if x > 0:
            yield ","
        yield json.dumps(get_item())
    yield "]"


if __name__ == "__main__":
    for s in get_array():
        sys.stdout.write(s)
    sys.stdout.write("\n")
My take, building on the idea from Glenn's answer but serializing a big dict as requested by the OP and using the more pythonic enumerate instead of manually incrementing i (errors can be taken into account by keeping a separate count for them and subtracting it from i before writing to f):
# Stream the followers to disk as one JSON object, one member at a time, so
# the full dict never has to exist in memory.
with open('C:/Twitter/user_' + str(self.id) + '.json', mode='w') as f:
    f.write('{')
    for i, friends in enumerate(Cursor(api.followers_ids, screen_name=self.ip).items()):
        # Members are comma-separated; no separator before the first one.
        if i > 0:
            f.write(", ")
        # JSON object keys must be strings: json.dumps(i) would emit a bare
        # number (0:{...}), which is not valid JSON. json.dump() on a dict
        # with int keys writes them as strings, so str(i) matches that output.
        f.write("%s:%s" % (json.dumps(str(i)), json.dumps(dict(fu=self.ip, su=friends))))
    f.write("}")
Related
I'm trying to write a code that will take data from a file and write it differently. I have the code for the most part but when i run it, everything is on one line.
import csv
#Step 4
def read_data(filename):
    """Open "dna.txt" for reading and return the open file object.

    The `filename` argument is not used; the path is hard-coded. The file's
    contents are not read here.
    """
    try:
        data = open("dna.txt", "r")
    except IOError:
        print( "File not found")
    # If the open failed, `data` was never assigned and this line raises.
    return data
#Step 5
def get_dna_stats(dna_string):
    """Compute an A/T ratio for `dna_string` (result is not returned)."""
    a_letters = ""
    t_letters = ""
    # `in` only tests presence, so each branch runs at most once.
    # str has no append() method, so these calls raise AttributeError.
    if "A" in dna_string:
        a_letters.append("A")
    if "T" in dna_string:
        t_letters.append("T")
    # Computed but never returned; the function returns None.
    nucleotide_content = ((len(a_letters) + len(t_letters))/len(dna_string))
#Step 6
def get_dna_complement(dna_string):
    """Build the complement of `dna_string` (A<->T, G<->C) and return it."""
    dna_complement = ""
    for i in dna_string:
        # str has no append() method; each of these calls raises AttributeError.
        if i == "A":
            dna_complement.append("T")
        elif i == "T":
            dna_complement.append("A")
        elif i == "G":
            dna_complement.append("C")
        elif i == "C":
            dna_complement.append("G")
        else:
            # Any other character stops the loop; the rest is not processed.
            break
    return dna_complement
#Step 7
def print_dna(dna_strand):
    """Print "X=Y" pairs built from `dna_strand` and its complement."""
    dna_complement = get_dna_complement(dna_strand)
    # Nested loops: every letter of the strand is paired with every letter of
    # the complement, not just the letter at the same position.
    for i in dna_strand:
        for j in dna_complement:
            print( i + "=" + j)
#Step 8
def get_rna_sequence(dna_string):
    """Build the RNA complement of `dna_string` (A->U, T->A, G->C, C->G)."""
    rna_complement = ""
    for i in dna_string:
        # str has no append() method; each of these calls raises AttributeError.
        if i == "A":
            rna_complement.append("U")
        elif i == "T":
            rna_complement.append("A")
        elif i == "G":
            rna_complement.append("C")
        elif i == "C":
            rna_complement.append("G")
        else:
            # Any other character stops the loop; the rest is not processed.
            break
    return rna_complement
#Step 9
def extract_exon(dna_strand, start, end):
    """Return a descriptive sentence containing the strand and both indices.

    No slicing is performed; the whole `dna_strand` is embedded in the text.
    """
    return (f"{dna_strand} between {start} and {end}")
#Step 10
def calculate_exon_pctg(dna_strand, exons):
    """Return (number of items in `exons`) / len(dna_strand)."""
    exons_length = 0
    # Adds 1 per exon, i.e. counts the exons rather than summing their lengths.
    for i in exons:
        exons_length += 1
    return exons_length/ len(dna_strand)
#Step 11
def format_data(dna_string):
    """Return upper/lower/upper-cased slices joined together.

    The slices are taken from the string literal "dna_strand", not from the
    `dna_string` argument, which is unused.
    """
    x = "dna_strand"[0:62].upper()
    y = "dna_strand"[63:90].lower()
    z = "dna_strand"[91:-1].upper()
    return x+y+z
#Step 12
def write_results(output, filename):
    """Write every string in `output` to "output.csv", back to back.

    `filename` is not used. No newline is written between items, so all the
    text ends up on one line.
    """
    try:
        with open("output.csv","w") as csvFile:
            writer = csv.writer(csvFile)  # created but never used
            for i in output:
                csvFile.write(i)
    except IOError:
        print("Error writing file")
#Step 13
def main():
    """Build the report lines and hand them to write_results()."""
    read_data("dna.txt")  # return value is discarded
    output = []
    # Throughout this function the helpers are called twice: once inside the
    # append() with no arguments (raises TypeError, as each takes at least
    # one), and once with the literal string "dna_sequence" whose result is
    # discarded.
    output.append("The AT content is" + get_dna_stats() + "% of the DNA sequence.")
    get_dna_stats("dna_sequence")
    output.append("The DNA complement is " + get_dna_complement())
    get_dna_complement("dna_sequence")
    output.append("The RNA sequence is" + get_rna_sequence())
    get_rna_sequence("dna_sequence")
    exon1 = extract_exon("dna_sequence", 0, 62)
    exon2 = extract_exon("dna_sequence", 91, len("dna_sequence"))
    output.append(f"The exon regions are {exon1} and {exon2}")
    # format_dna is not defined in this script; the function above is named format_data.
    output.append("The DNA sequence, which exons in uppercase and introns in lowercase, is" + format_dna())
    format_data("dna_sequence")
    output.append("Exons comprise " + calculate_exon_pctg())
    calculate_exon_pctg("dna_sequence",[exon1, exon2])
    write_results(output, "results.txt")
    print("DNA processing complete")
#Step 14
if __name__ == "__main__":
    main()
When I run it, its supposed to output a file that looks like this but my code ends up putting every word on the top line like this
I have a feeling it has to do with the write_resultsfunction but that's all i know on how to write to the file.
The second mistake I'm making is that I'm not calling the functions correctly in the append statements. I've tried concatenating and I've tried formatting the string but now I'm hitting a road block on what I need to do.
When you write to the file you need to concat a '\n' to the end of the string every time you want to have something on a new line in the written file
for example:
output.append("The AT content is" + get_dna_stats() + "% of the DNA sequence." + '\n')
To solve your second problem I would change your code to something like this:
temp = "The AT content is" + get_dna_stats() + "% of the DNA sequence." + '\n'
output.append(temp)
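Building the string in a temporary variable first makes it easy to inspect the value before it is appended to the list. Note that the function is actually called in both forms, so the temporary variable by itself does not change the result: for the concatenation to work, get_dna_stats() must be given the DNA string as its argument, must return a value, and that value must be converted with str() if it is a number.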
read_data() doesn't actually read anything (just opens file). It should read the file and return its contents:
def read_data(filename):
    """Return the complete text content of the file at `filename`."""
    with open(filename, "r") as source:
        contents = source.read()
    return contents
get_dna_stats() won't get DNA stats: it doesn't return anything, it doesn't count "A"s or "T"s (it only checks whether they're present), and nucleotide_content is computed but never used or returned. It should probably count and return the results:
def get_dna_stats(dna_string):
    """Return the fraction of letters in `dna_string` that are "A" or "T".

    The result is a float between 0.0 and 1.0 (multiply by 100 for a
    percentage). An empty string yields 0.0 instead of dividing by zero.
    """
    if not dna_string:
        return 0.0
    num_a = dna_string.count("A")
    num_t = dna_string.count("T")
    # float() keeps the division exact under Python 2 as well.
    return (num_a + num_t) / float(len(dna_string))
get_dna_complement() and get_rna_sequence(): you can't append to a string. Instead use
dna_complement += "T"
... and rather than break, either append a "?" to denote a failed transcription, or raise ValueError("invalid letter in DNA: "+i)
print_dna() is a bit more interesting. I'm guessing you want to "zip" each letter of the DNA and its complement. Coincidentally, you can use the zip function to achieve just that:
def print_dna(dna_strand):
    """Print each letter of `dna_strand` beside its complement as "X=Y"."""
    complement_strand = get_dna_complement(dna_strand)
    # zip() pairs the letters position by position.
    pairs = zip(dna_strand, complement_strand)
    for base, partner in pairs:
        print(base + "=" + partner)
As for extract_exon(), I don't know what that is, but presumably you just want the substring from start to end, which is achieved by:
def extract_exon(dna_strand, start, end):
    """Return the part of `dna_strand` from index `start` up to, not including, `end`."""
    # possibly end+1, I don't know exons
    exon = dna_strand[slice(start, end)]
    return exon
I am guessing that in calculate_exon_pctg(), you want exons_length += len(i) to sum the lengths of the exons. You can achieve this by using the built-in function sum:
# sum() cannot add strings, so add up the length of each exon instead.
exons_length = sum(len(exon) for exon in exons)
In function format_data(), lose the double quotes: you want the variable, not a string literal. Also note that the parameter is named dna_string, not dna_strand.
main() doesn't pass any data around. It should pass the results of read_data() to all the other functions:
def main():
    """Read the DNA file, build the report lines and write them out."""
    data = read_data("dna.txt")
    output = []
    # get_dna_stats() returns a number, which cannot be concatenated to a str
    # directly. It is a fraction (0-1), so scale it to match the "%" in the text.
    output.append("The AT content is " + str(get_dna_stats(data) * 100) + "% of the DNA sequence.")
    output.append("The DNA complement is " + get_dna_complement(data))
    output.append("The RNA sequence is " + get_rna_sequence(data))
    ...
    write_results(output, "results.txt")
    print("DNA processing complete")
The key for you at this stage is to understand how function calls work: they take data as input parameters, and they return some results. You need to a) provide the input data, and b) catch the results.
write_results() - from your screenshot, you seem to want to write a plain old text file, yet you use csv.writer() (which writes CSV, i.e. tabular data). To write plain text,
def write_results(output, filename):
    """Write the strings in `output` to `filename`, one per line."""
    # Newline between the lines, plus one closing newline at the end of the file.
    text = "\n".join(output) + "\n"
    with open(filename, "w") as f:
        f.write(text)
If you really do want a CSV file, you'll need to define the columns first, and make all the output you collect fit that column format.
You never told your program to make a new line. You could either append or prepend the special "\n" character to each of your strings or you could do it in a system agnostic way by doing
import os
at the top of your file and writing your write_results function like this:
def write_results(output, filename):
    """Write each string in `output` to `filename` on its own line.

    Lines are terminated with os.linesep. An IOError is reported on the
    console instead of being raised.
    """
    try:
        # newline="" switches off newline translation, so os.linesep is
        # written exactly once (in default text mode "\r\n" would become
        # "\r\r\n" on Windows).
        with open(filename, "w", newline="") as csvFile:
            for i in output:
                csvFile.write(i)
                csvFile.write(os.linesep)  # Add this line! It is a system agnostic newline
    except IOError:
        print("Error writing file")
I am using windows10 and python 2.7.14. Running the python scripts in command prompt. I want to read some lines in text file and compare with some text, if it matches it should be stored in array. And also I want the array should be global. But In my script the I am not able to store the contents in array. How do I achieve this.
#This method is to reading logfile and saving the different datas in different lists
def Readlogs(Filename):
    """Scan `Filename` for "login = " and "Overlay = " lines and print the text after "= ".

    Each match overwrites a plain local variable; nothing is collected in a
    list and nothing is returned.
    NOTE(review): indentation was lost in this copy; nesting is reconstructed.
    """
    datafile = file(Filename)  # file() is the Python 2 built-in; the handle is never closed
    for line in datafile:
        if "login = " in line:
            print(line)
            trial=line
            s2 = "= "
            # Everything after the first "= ", including the trailing newline.
            ArrayLogin = trial[trial.index(s2) + len(s2):]
            print(ArrayLogin)
            print(ArrayLogin)
        if "Overlay = " in line:
            print(line)
            trial2=line
            s2 = "= "
            arrayOverlay = trial2[trial2.index(s2) + len(s2):]
            print(arrayOverlay)
Readlogs(WriteFileName)
You can declare empty arrays and append items to it.
#This method is to reading logfile and saving the different datas in different lists
def Readlogs(Filename):
    """Collect the values of "login = " and "Overlay = " lines from a log file.

    Returns a tuple (ArrayLogin, arrayOverlay) of two lists. Each entry is
    the text that follows the first "= " on a matching line, including the
    line's trailing newline.
    """
    # empty arrays
    ArrayLogin, arrayOverlay = [], []
    s2 = "= "
    # open() works on Python 2 and 3 (file() is Python 2 only), and the
    # with-block closes the handle even if a line fails to parse.
    with open(Filename) as datafile:
        for line in datafile:
            if "login = " in line:
                print(line)
                ArrayLogin.append(line[line.index(s2) + len(s2):])
                print(ArrayLogin)
            if "Overlay = " in line:
                print(line)
                arrayOverlay.append(line[line.index(s2) + len(s2):])
                print(arrayOverlay)
    return ArrayLogin, arrayOverlay
# Unpack the two lists returned by Readlogs().
(arr1, arr2) = Readlogs(WriteFileName)
I have code like this:
def export_devices():
    """Ask for a device code and amount, then reduce that device's stock in uredjaji.txt.

    NOTE(review): indentation was lost in this copy; the nesting below is a
    reconstruction -- confirm against the original post.
    """
    code = input("Enter device code: ")
    amount = int(input("How many devices you export: "))
    with open("uredjaji.txt", "r+") as f:
        current_position = 0  # file offset where the line currently held in `line` starts
        line = f.readline()
        while line:
            if line[:len(code) + 1] == code + ":":
                line = line.rstrip()
                # The amount is whatever follows the last ":" on the line.
                amount_index = line.rfind(":") + 1
                current_amount = int(line[amount_index:])
                if amount > current_amount:
                    print("There no that many devices in stock...")
                    return
                # Rewrite the file from the matched line onward: keep the tail,
                # cut the file at the start of this line, write the updated
                # line, then write the tail back.
                remaining_content = f.read()
                f.seek(current_position)
                f.truncate()
                line = line[:amount_index] + str(current_amount - amount) + "\n"
                f.write(line)
                f.write(remaining_content)
                return  # returns before any transaction record is written
            current_position = f.tell()
            line = f.readline()
            # Runs once for every non-matching line, so the same record is
            # appended once per line read before the match.
            with open('transakcije.txt','a') as transactions:
                date = datetime.date.today().strftime('%d.%m.%Y.')
                transactions.write("1" + ":" + str(amount) + ":" + "export" + ":" + str(date) + ":" + "username" + "\n")
    print("Error device code: {}".format(code))
Now I would like to my "transakcije.txt" looks like this:
1:3:iznos:17.06.2017.:username
But it always append the same line for three times. With any other kind of indentation it won't append at all.
Also, my uredjaji.txt file looks like this:
tw004:Galaxy S5:Samsung:Mobilni telefon:3
tw002:Galaxy S6:Samsung:Mobilni telefon:1
tw001:Huawei P8:Huawei:Mobilni telefon:1
tw003:Huawei P9:Huawei:Mobilni telefon:100998
P.S: "username" should be variable from another function, so if someone could help me how to write that variable in this file I will be so thankfull. :)
When you open the file, you read a single line and then go into a while loop on the existence of line. If you do not get a match on the input code, you then attempt to, I suppose, reposition the file pointer with f.tell() but you do not do a seek. Thereafter, you read the file again and write transakcije.txt. Sadly, the original while loop is still in play, so you will write transakcije.txt multiple times.
It is not clear what you are attempting to achieve with this code but you need to sit down and rethink it from the ground up.
If it is some sort of stock reporting/replenishment routine, I can't help thinking that a database (sqlite3 as a simple starter) would be more appropriate than pulling ASCII files apart.
I have currently this:
def download_dropbox(url, pre_file_name):
    """Download `url` and save it as `pre_file_name` + a name derived from the URL.

    The local name is url[42:] with its last 5 characters removed.
    """
    # NOTE(review): the fixed offsets presumably strip a Dropbox link prefix
    # and a trailing suffix such as "?dl=1" -- confirm against real URLs.
    remote_name = url[42:]
    remote_name = remote_name[:-5]
    file_name = pre_file_name + remote_name
    print('Downloading from ' + url + ' to ' + file_name)
    print(remote_name)
    # The with-block closes the response even if read() raises.
    with urllib.request.urlopen(url) as u:
        data = u.read()
    with open(file_name, "wb") as f:
        f.write(data)
    print('Download Completed from ' + url + ' and saved to ' + file_name)
This basically downloads files from dropbox and saves it to a directory. However I want to be able to have some sort of text progress bar like:
[==== ]50%
OR
50%
The hard part i would think is doing it with any external modules like the loading bar module, etc. Also, as the title states, I need it in python 3. Thank-you.
Edit:
Thanks to Martin Evans for the data read while loop and progress bar here is the end result of the code:
# Get the total number of bytes of the file to download before downloading
print("opening url:", url)
u = urllib.request.urlopen(url)
try:
    # Look the header up by name; picking a fixed index out of the split
    # header text breaks as soon as the server sends different headers.
    content_length = u.headers.get("Content-Length")
    # A server may omit Content-Length; 0 means "size unknown".
    fileTotalbytes = int(content_length) if content_length else 0
    print("Content-Length:" + str(content_length) + " bytes")
    data_blocks = []
    total = 0
    while True:
        block = u.read(1024)
        if not block:  # empty read means the download is finished
            break
        data_blocks.append(block)
        total += len(block)
        if fileTotalbytes:
            filled = (60 * total) // fileTotalbytes
            print("[{}{}] {}%".format('#' * filled, ' ' * (60 - filled), int(total / fileTotalbytes * 100)), end="\r")
        else:
            # No percentage can be computed without a total size.
            print("Downloaded {} bytes".format(total), end="\r")
finally:
    u.close()
data = b''.join(data_blocks)  # the blocks are bytes, so the separator must be bytes too
with open('test.zip', "wb") as f:
    f.write(data)
To answer your main question, how to make a text progress bar, you could use something like the following to give you an idea:
import time

# Demo: draw a 60-character progress bar that fills up from 1% to 100%.
for n in range(1, 101):
    filled = (60 * n) // 100  # number of '#' characters for n percent
    # "\r" returns to the start of the line so the next print overwrites the bar.
    print("[{}{}] {}%".format('#' * filled, ' ' * (60 - filled), n), end="\r")
    time.sleep(0.05)
This would give you the following:
[########################### ] 45%
Your main problem though is that there is no obvious way to determine how many bytes will eventually be downloaded unless you already know the exact size of the item being downloaded beforehand. If you control the server end then you could arrange for the length to be obtained before starting.
You can though start by at least converting your read() line to something like the following:
# Read the response in 1 KiB blocks so progress can be reported while downloading.
u = urllib.request.urlopen(url)
data_blocks = []
total = 0
while True:
    block = u.read(1024)  # read from the response object opened above
    if not block:  # empty read means the download is finished
        break
    data_blocks.append(block)
    total += len(block)
    print("Downloaded {} bytes".format(total), end="\r")
data = b"".join(data_blocks)  # read() returns bytes, so join with a bytes separator
u.close()
By doing it this way, you read it a bit at a time and can then provide feedback.
You can use print with \r at the start to go to the start of the line and write over the previous text (so you need to write spaces if you want to clear a character). Here's a simple example:
from time import sleep

# Demo: redraw a growing row of dots on the same console line.
for dots in range(20):
    # "\r" moves back to the start of the line; end="" suppresses the newline.
    print('\r' + '.' * dots, end="")
    sleep(0.1)
I have created a script that successfully searches for keywords (specified by user) within a Blastx output file in XML format. Now, I need to write those records (query, hit, score, evalue, etc) that contain the keyword in the alignment title to a new file.
I have created separate lists for each of the query titles, hit title, e-value and alignment lengths but cannot seem to write them to a new file.
Problem #1: what if Python errors, and one of the lists is missing a value...? Then all the other lists will be giving wrong information in reference to the query ("line slippage", if you will...).
Problem #2: even if Python doesn't error, and all the lists are the same length, how can I write them to a file so that the first item in each list is associated with each other (and thus, item #10 from each list is also associated?) Should I create a dictionary instead?
Problem#3: dictionaries have only a single value for a key, what if my query has several different hits? Not sure if it will be overwritten or skipped, or if it will just error. Any suggestions? My current script:
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import re
# Script: search the alignment titles of a Blast XML output file for a
# user-supplied pattern and write matching records to a text file.
# NOTE(review): indentation was lost in this copy; the nesting below is a
# reconstruction -- confirm against the original post.
#obtain full path to blast output file (*.xml)
outfile = input("Full path to Blast output file (XML format only): ")
#obtain string to search for
search_string = input("String to search for: ")
#open the output file
result_handle = open(outfile)
#parse the blast record
blast_records = NCBIXML.parse(result_handle)
#initialize lists
query_list=[]
hit_list=[]
expect_list=[]
length_list=[]
#create 'for loop' that loops through each HIGH SCORING PAIR in each ALIGNMENT from each RECORD
for record in blast_records:
    for alignment in record.alignments: #for description in record.descriptions???
        for hsp in alignment.hsps: #for title in description.title???
            #search for designated string
            search = re.search(search_string, alignment.title)
            #if search comes up with nothing, end
            if search is None:
                print ("Search string not found.")
                break
            #if search comes up with something, add it to a list of entries that match search string
            else:
                #option to include an 'exception' (if it finds keyword then DOES NOT add that entry to list)
                # This condition is always true: `or "entamoeba"` is a
                # non-empty string, so the else branch below never runs.
                if search is "trichomonas" or "entamoeba" or "arabidopsis":
                    print ("found exception.")
                    break
                else:
                    query_list.append(record.query)
                    hit_list.append(alignment.title)
                    # expect_val and length are only assigned further down.
                    expect_list.append(expect_val)
                    length_list.append(length)
                #explicitly convert 'variables' ['int' object or 'float'] to strings
                length = str(alignment.length)
                expect_val = str(hsp.expect)
                #print ("\nquery name: " + record.query)
                #print ("alignment title: " + alignment.title)
                #print ("alignment length: " + length)
                #print ("expect value: " + expect_val)
                #print ("\n***Alignment***\n")
                #print (hsp.query)
                #print (hsp.match)
                #print (hsp.sbjct + "\n\n")
    # `break` is only legal inside a loop, so this check is placed in the
    # per-record loop (reconstruction). The *_len names are first assigned in
    # the else branch, and `is not` compares identity rather than value.
    if query_len is not hit_len is not expect_len is not length_len:
        print ("list lengths don't match!")
        break
    else:
        qrylen = len(query_list)
        query_len = str(qrylen)
        hitlen = len(hit_list)
        hit_len = str(hitlen)
        expectlen = len(expect_list)
        expect_len = str(expectlen)
        lengthlen = len(length_list)
        length_len = str(lengthlen)
outpath = str(outfile)
#create new file
outfile = open("__Blast_Parse_Search.txt", "w")
outfile.write("File contains entries from [" + outpath + "] that contain [" + search_string + "]")
outfile.close  # no parentheses: the method is referenced, not called
#write list to file
i = 0
list_len = int(query_len)
for i in range(0, list_len):
    #append new file
    outfile = open("__Blast_Parse_Search.txt", "a")
    # Writes all four whole lists, one after another, on every iteration;
    # writelines() adds no separators or newlines.
    outfile.writelines(query_list + hit_list + expect_list + length_list)
    i = i + 1
#write to disk, close file
outfile.flush()
outfile.close  # no parentheses: the method is referenced, not called
print ("query list length " + query_len)
print ("hit list length " + hit_len)
print ("expect list length " + expect_len)
print ("length list length " + length_len + "\n\n")
print ("first record: " + query_list[0] + " " + hit_list[0] + " " + expect_list[0] + " " + length_list[0])
print ("last record: " + query_list[-1] + " " + hit_list[-1] + " " + expect_list[-1] + " " + length_list[-1])
print ("\nFinished.\n")
If I understand your problem correctly you could use a default value for the line slippage thing like:
# Sketch: try the operation and, if it fails, store a default entry so the
# list still gets a value.
# NOTE(review): x, append_default_value and exception look like placeholders;
# none of them is defined in this snippet.
try:
    x(list)
except exception:
    append_default_value(list)
http://docs.python.org/tutorial/errors.html#handling-exceptions
or use tuples for dictionary keys like (0,1,1) and use the get method for your default value.
http://docs.python.org/py3k/library/stdtypes.html#mapping-types-dict
If you need to maintain data structures in your output files you might try using shelve:
or you could append some type of reference after each record and give each record a unique id for example '#32{somekey:value}#21#22#44#'
again you can have multiple keys using a tuple.
I don't know if that helps, you might clarify exactly what parts of your code you have trouble with. Like x() gives me output y but I expect z.