How to slice a very long string in Python

I need to slice a very long string (DNA sequences) in Python. Currently I'm using this:
new_seq = clean_seq[start:end]
I'm taking a slice roughly every 20,000 characters, and each slice is about 1,000 characters long.
The input is a 250 MB file containing a few strings, each identified by an id, and this method is taking too long.
The sequence string comes from the Biopython module:
def fasta_from_ann(annotation, sequence, feature, windows, output_fasta):
    df_gff = pd.read_csv(annotation, index_col=False, sep='\t', header=None)
    df_gff.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
    fasta_seq = SeqIO.parse(sequence, 'fasta')
    buffer = []
    for record in fasta_seq:
        df_exctract = df_gff[(df_gff.seqname == record.id) & (df_gff.feature == feature)]
        for k, v in df_exctract.iterrows():
            clean_seq = ''.join(str(record.seq).splitlines())
            if int(v.start) - windows < 0:
                start = 0
            else:
                start = int(v.start) - windows
            if int(v.end) + windows > len(clean_seq):
                end = len(clean_seq)
            else:
                end = int(v.end) + windows
            new_seq = clean_seq[start:end]
            new_id = record.id + "_from_" + str(v.start) + "_to_" + str(v.end) + "_feature_" + v.feature
            desc = "attribute: " + v.attribute + " strand: " + v.strand
            seq = SeqRecord(Seq(new_seq), id=new_id, description=desc)
            buffer.append(seq)
        print(record.id)
    SeqIO.write(buffer, output_fasta, "fasta")
Maybe there's a more memory-friendly way to accomplish this.
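One thing worth noting in the snippet above: clean_seq is rebuilt with ''.join(str(record.seq).splitlines()) inside the inner iterrows() loop even though it only depends on record. A minimal sketch of hoisting that join out of the inner loop (same variables as above, untested):
for record in fasta_seq:
    # build the flat sequence string once per record, not once per annotation row
    clean_seq = ''.join(str(record.seq).splitlines())
    df_exctract = df_gff[(df_gff.seqname == record.id) & (df_gff.feature == feature)]
    for k, v in df_exctract.iterrows():
        start = max(int(v.start) - windows, 0)
        end = min(int(v.end) + windows, len(clean_seq))
        new_seq = clean_seq[start:end]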

Related

Openpyxl module: returns a weird value (not an error) when I want to calculate

I wrote some code to let the user check the percentage of the money they spent (compared to the money they earned). Almost every step performs normally, until the final part.
a_c[('L'+row_t)].value returns:
=<Cell 'Sheet1'.B5>/<Cell 'Sheet1'.J5>
yet I expected it to be an actual value.
Code:
st_column = st_column_r.capitalize()
row_s = str(a_c.max_row)
row_t = str(a_c.max_row + 1)
row = int(row_t)
a_c[('J'+row_t)] = ('=SUM(I2,J'+row_s+')')  # total income
errorprevention = a_c[('J'+row_t)].value
a_c[(st_column+row_t)] = ('=SUM('+(st_column+'2')+','+(st_column+row_s)+')')
a_c['L'+row_t].number_format = FORMAT_PERCENTAGE_00
if errorprevention != 0:
    a_c[('L'+row_t)] = ('='+str(a_c[(st_column+row_t)])+'/'+str(a_c[('J'+row_t)]))
    print('In past spending, the ratio of the ' + inputtype[st_column] + ' category to total income is: ' + a_c[('L'+row_t)].value)
Try changing the formula creation to:
a_c[('L' + row_t)].value = '=' + a_c[(st_column + row_t)].coordinate + '/' + a_c[('J' + row_t)].coordinate
or use an f string
a_c[('L' + row_t)].value = f"={a_c[(st_column + row_t)].coordinate}/{a_c[('J' + row_t)].coordinate}"
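The difference is that str() on a Cell object gives its repr (like <Cell 'Sheet1'.B5>), while .coordinate gives just the cell reference, so the stored formula becomes =B5/J5. A minimal illustration (the sheet and coordinates here are just examples, not from the question's workbook):
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
print(str(ws['B5']))        # <Cell 'Sheet'.B5> -- the repr that leaked into the formula
print(ws['B5'].coordinate)  # B5 -- the plain reference a formula needs
ws['L5'] = f"={ws['B5'].coordinate}/{ws['J5'].coordinate}"  # stores '=B5/J5'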

How do I improve my code to make it run faster?

I am conducting a data science project to analyse large volumes of cancer genome data. My computer is relatively underpowered, with a slow CPU and little RAM, so running through all the samples takes far too long.
I have tried removing excess code, replacing for loops with list comprehensions, and using multiprocessing to split up my tasks so they run faster.
import re
import xlrd
import os
import time
from multiprocessing import Pool
import collections
import pandas as pd

if os.path.exists("C:\\Users\\js769\\genomemutations\\Input\\ChromosomesVersion") == True:
    print("chromosomes in folder")
else:
    os.makedirs("C:\\Users\\js769\\genomemutations\\Input\\ChromosomesVersion")
    print(
        "Chromosome Folder Created, Please transfer current version of chromosome number base data to new file."
    )
if os.path.exists("C:\\Users\\js769\\genomemutations\\Input\\MutationSamples") == True:
    print("Add sample data to run.")
else:
    os.makedirs("C:\\Users\\js769\\genomemutations\\Input\\MutationSamples")
    print("Mutation Sample Folder Created, please add mutation sample data to folder.")
if os.path.exists("C:\\Users\\js769\\genomemutations\\output") == True:
    print("3")
else:
    os.makedirs("C:\\Users\\js769\\genomemutations\\output")

# Require editing of this so it works both on a mac or windows system. Currently this version suited to mac because of higher processing power.
# Require ability to check to see if error occurs

def Main(Yeram):
    import os
    import glob
    import errno
    import shutil
    import xlrd
    import pandas as pd
    import time
    import re
    import numpy as np

    FragmentSize = 10000000  # This is fragment size which is adjustable.
    # Code not needed
    Position1 = Yeram.vectx
    Position2 = Yeram.vecty
    samplelist = Yeram.samplelist
    dictA = Yeram.dictA
    FragmentSize = Yeram.FragmentSize
    chromosomesizes = Yeram.chromosomesizes
    def chromosomex_mutation_data(
        chromosomenumber, mutationlist
    ):  # It selects the correct chromosome mutation point data, then it selects the data before the -. Mutation data in form (12-20)
        chromosomexlist = ["0-1"]
        for mutationposition in mutationlist:
            if mutationposition[0:2] == str(chromosomenumber):
                chromosomexlist.append(mutationposition[3:])
            elif mutationposition[0:2] == (str(chromosomenumber) + ":"):
                chromosomexlist.append(mutationposition[2:])
            else:
                continue
        Puremutationdatapoints = [int(mutationposition.split("-")[0]) for mutationposition in chromosomexlist]
        return Puremutationdatapoints

    def Dictionary_Of_Fragment_mutation(FragmentSize, MutationData, ChromosomeNumber):
        chromosomes = {}  # Dictionary
        chromosomesize = chromosomesizes[ChromosomeNumber - 1]
        # Opening up specific chromosome data and calculating amount of bases present in chromosome
        Number_of_fragments = int(chromosomesize / FragmentSize)
        for mutation in MutationData:
            for i in range(0, (Number_of_fragments), 1):
                a = (
                    "Chromosome"
                    + str(ChromosomeNumber)
                    + "Fragment"
                    + str(i)
                    + ",Basepairs "
                    + str(i * FragmentSize + 1)
                    + "-"
                    + str(i * FragmentSize + FragmentSize)
                )
                if mutation in range(i * FragmentSize + 1, i * FragmentSize + FragmentSize + 1):
                    if chromosomes.get(a) == None:
                        chromosomes.update({a: 1})
                    else:
                        b = (chromosomes.get(a)) + 1
                        chromosomes.update({a: b})
                else:
                    if chromosomes.get(a) == None:
                        chromosomes.update({a: 0})
                    else:
                        continue
        return chromosomes
    # This adds mutations or no mutation to each fragment for a chromosome, makes dictionaries

    def DictionaryRead(FragmentSize, Dict, ChromosomeNumber):
        chromosomesize = chromosomesizes[ChromosomeNumber - 1]
        Number_of_fragments = int(chromosomesize / FragmentSize)
        chromosomefragmentlist = []
        for i in range(0, (Number_of_fragments), 1):
            a = (
                "Chromosome"
                + str(ChromosomeNumber)
                + "Fragment"
                + str(i)
                + ",Basepairs "
                + str(i * FragmentSize + 1)
                + "-"
                + str(i * FragmentSize + FragmentSize)
            )
            chromosomefragmentlist.append(str(Dict.get((a))))
        return chromosomefragmentlist
    # This uses the dictionary to create a list

    def forwardpackage2(FragmentSize, PureMutationData):
        C = []  # list of data in numerical order, 0 = no mutation
        for i in range(1, 23, 1):
            A = chromosomex_mutation_data(i, PureMutationData)  # Purifies Data
            B = Dictionary_Of_Fragment_mutation(FragmentSize, A, i)  # Constructs Dictionary
            C += DictionaryRead(
                FragmentSize, B, i
            )  # Uses constructed Dictionary and generates list of numbers, each number being a fragment in numerical order.
        return C

    def Mutationpointdata(Position1, Position2, dictA, FragmentSize):  # Require dictA
        vectx = Position1
        vecty = Position2
        Samplesandmutationpoints = []
        for i in range(vectx, vecty):
            print(samplelist[i])
            new = [k for k, v in dictA.items() if int(v) == samplelist[i]]
            mutationlist = [excelsheet.cell_value(i, 23) for i in new]
            mutationlist.sort()
            Samplesandmutationpoints.append(forwardpackage2(FragmentSize, mutationlist))
        return Samplesandmutationpoints

    # Opening sample data from excel table
    return Mutationpointdata(Position1, Position2, dictA, FragmentSize)  # yeram to james samples
def ChromosomeSequenceData(ChromosomeNumber):  # Formats the chromosome file into readable information
    with open(
        r"C:\Users\js769\genomemutations\Input\ChromosomesVersion\chr" + str(ChromosomeNumber) + ".fa"
    ) as text_file:
        text_data = text_file.read()
    listA = re.sub("\n", "", text_data)
    # list2=[z for z in text_data if z!= "\n"]
    if ChromosomeNumber < 10:
        ChromosomeSequenceData = listA[5:]
    else:
        ChromosomeSequenceData = listA[6:]
    return ChromosomeSequenceData

def basepercentage_single(
    i, FragmentSize, ChromosomeSequenceData
):  # Creates the known-base percentage for one fragment of a chromosome.
    sentence = ChromosomeSequenceData[(i * FragmentSize + 1):(i * FragmentSize + FragmentSize)]
    a = sentence.count("N") + sentence.count("n")
    c = str(((FragmentSize - a) / FragmentSize) * 100) + "%"
    return c

def basepercentage_multiple(
    FragmentSize, ChromosomeSequenceData
):  # Creates a list of known-base percentages which correspond with the dna fragments for one chromosome.
    fragmentamount = int(len(ChromosomeSequenceData) / FragmentSize)
    list = [
        basepercentage_single(i, FragmentSize, ChromosomeSequenceData) for i in range(0, (fragmentamount), 1)
    ]
    return list

def FragmentEncodedPercentage(
    FragmentSize
):  # Packages a list of known-base percentages which correspond with the dna fragments for every chromosome.
    Initial_list = [basepercentage_multiple(FragmentSize, ChromosomeSequenceData(i)) for i in range(1, 23, 1)]
    List_of_fragment_encoded_percentages = [item for sublist in Initial_list for item in sublist]
    return List_of_fragment_encoded_percentages

def chromosomefragmentlist(
    FragmentSize, ChromosomeNumber
):  # Creates a list of fragment labels for a specific chromosome.
    chromosomesize = chromosomesizes[ChromosomeNumber - 1]
    Number_of_fragments = int(chromosomesize / FragmentSize)
    chromosomefragmentlist = []
    for i in range(0, (Number_of_fragments), 1):
        a = (
            "Chromosome"
            + str(ChromosomeNumber)
            + "Fragment"
            + str(i)
            + ",Basepairs "
            + str(i * FragmentSize + 1)
            + "-"
            + str(i * FragmentSize + FragmentSize)
        )
        chromosomefragmentlist.append(str(((a))))
    return chromosomefragmentlist

def GenomeFragmentGenerator(
    FragmentSize
):  # Creates the genome fragments for all chromosomes and adds them all to a list.
    list = [chromosomefragmentlist(FragmentSize, i) for i in range(1, 23, 1)]
    A = [item for sublist in list for item in sublist]
    return A

def excelcreation(
    mutationdata, samplelist, alpha, bravo, FragmentSize, A, B
):  # Program runs sample alpha to bravo and then constructs the output table
    data = {"GenomeFragments": A, "Encoded Base Percentage": B}
    for i in range(alpha, bravo):
        data.update({str(samplelist[i]): mutationdata[i]})
    df = pd.DataFrame(data, index=A)
    export_csv = df.to_csv(
        r"C:/Users/js769/genomemutations/output/chromosomeAll.csv", index=None, header=True
    )
start_time = time.time()

# Code to determine base fragment size
FragmentSize = 1000000

chromosomesizes = []  # This calculates the base pair sizes for each chromosome.
for i in range(1, 23):
    with open(r"C:\Users\js769\genomemutations\Input\ChromosomesVersion\chr" + str(i) + ".fa") as text_file:
        text_data = text_file.read()
    list = re.sub("\n", "", text_data)
    if i < 10:
        chromosomesizes.append(len(list[5:]))
    else:
        chromosomesizes.append(len(list[6:]))

wb = xlrd.open_workbook("C:/Users/js769/genomemutations/input/MutationSamples/Complete Sample For lungs.xlsx")
excelsheet = wb.sheet_by_index(0)
excelsheet.cell_value(0, 0)
sampleswithduplicates = [excelsheet.cell_value(i, 5) for i in range(1, excelsheet.nrows)]
samplelist = []
for sample in sampleswithduplicates:
    if sample not in samplelist:
        samplelist.append(int(sample))  # Constructs list of samples; each sample only comes up once
dictA = {}
counter = 1  # Creates a dictionary keyed by a running counter
for sample in sampleswithduplicates:
    dictA.update({counter: int(sample)})
    counter = counter + 1

A = GenomeFragmentGenerator(FragmentSize)
B = FragmentEncodedPercentage(FragmentSize)
value = collections.namedtuple(
    "value", ["vectx", "vecty", "samplelist", "dictA", "FragmentSize", "chromosomesizes"]
)
SampleValues = (
    value(
        vectx=0,
        vecty=2,
        samplelist=samplelist,
        dictA=dictA,
        FragmentSize=FragmentSize,
        chromosomesizes=chromosomesizes,
    ),
    value(
        vectx=2,
        vecty=4,
        samplelist=samplelist,
        dictA=dictA,
        FragmentSize=FragmentSize,
        chromosomesizes=chromosomesizes,
    ),
    value(
        vectx=4,
        vecty=6,
        samplelist=samplelist,
        dictA=dictA,
        FragmentSize=FragmentSize,
        chromosomesizes=chromosomesizes,
    ),
    value(
        vectx=6,
        vecty=8,
        samplelist=samplelist,
        dictA=dictA,
        FragmentSize=FragmentSize,
        chromosomesizes=chromosomesizes,
    ),
    value(
        vectx=8,
        vecty=10,
        samplelist=samplelist,
        dictA=dictA,
        FragmentSize=FragmentSize,
        chromosomesizes=chromosomesizes,
    ),
    value(
        vectx=10,
        vecty=12,
        samplelist=samplelist,
        dictA=dictA,
        FragmentSize=FragmentSize,
        chromosomesizes=chromosomesizes,
    ),
    value(
        vectx=12,
        vecty=14,
        samplelist=samplelist,
        dictA=dictA,
        FragmentSize=FragmentSize,
        chromosomesizes=chromosomesizes,
    ),
    value(
        vectx=14,
        vecty=16,
        samplelist=samplelist,
        dictA=dictA,
        FragmentSize=FragmentSize,
        chromosomesizes=chromosomesizes,
    ),
)
print("starting multiprocessing")
if __name__ == "__main__":
with Pool(4) as p:
result = p.map(Main, SampleValues)
Allmutationdata = []
for i in result:
for b in i:
Allmutationdata.append(b)
excelcreation(Allmutationdata, samplelist, 0, 16, FragmentSize, A, B)
print("My program took " + str(time.time() - start_time) + " to run")
So the program runs; that isn't the issue. The issue is how long it takes to run. Can anyone spot where my code may be at fault?
This article How to make your pandas loop run 72,000x faster has really resonated with me and I think will help you.
It provides clear instructions on how to vectorize your for loops to drastically speed them up.
Methods to speed up a For Loop:
Utilize pandas iterrows()
~321 times faster
Example
for index, row in dataframe.iterrows():
    print(index, row)
Pandas Vectorization
~9280 times faster
Example
df.loc[((col1 == val1) & (col2 == val2)), column_name] = conditional_result
Numpy Vectorization
~72,000 times faster
Example
df.loc[((col1.values == val1) & (col2.values == val2)), column_name] = conditional_result
By adding .values we receive a numpy array.
Credit for the timing results goes to that article.
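As a rough, self-contained illustration of the three approaches (the column names and condition are made up for the example, not taken from your code):
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "col1": np.random.randint(0, 10, 100_000),
    "col2": np.random.randint(0, 10, 100_000),
    "result": 0,
})

# 1) iterrows(): one Python-level iteration per row
for index, row in df.iterrows():
    if row["col1"] == 3 and row["col2"] == 7:
        df.at[index, "result"] = 1

# 2) pandas vectorization: a single boolean mask over whole columns
df.loc[(df["col1"] == 3) & (df["col2"] == 7), "result"] = 1

# 3) numpy vectorization: .values drops the pandas index machinery
df.loc[(df["col1"].values == 3) & (df["col2"].values == 7), "result"] = 1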

Problems with calculating gcskews using python

Hi everyone.
I have some problems with calculating GC skews in Python.
My two major inputs are a FASTA file and a BED file.
The BED file has the columns gn (0), gene_type (1), gene name (2), chromosome (3), strand (4), num (5), and start (6). (These numbers are index numbers in Python.) I am trying to use some functions that calculate the GC skews of the sense and antisense strands from the start site of each gene. The window is 100 bp, and these are the functions:
import re
import sys
import os

# opening bed file
content = []
with open("gene_info.full.tsv") as new:
    for line in new:
        content.append(line.strip().split())
content = content[1:]

def fasta2dict(fil):
    dic = {}
    scaf = ''
    seq = []
    for line in open(fil):
        if line.startswith(">") and scaf == '':
            scaf = line.split(' ')[0].lstrip(">").replace("\n", "")
        elif line.startswith(">") and scaf != '':
            dic[scaf] = ''.join(seq)
            scaf = line.split(' ')[0].lstrip(">").replace("\n", "")
            seq = []
        else:
            seq.append(line.rstrip())
    dic[scaf] = ''.join(seq)
    return dic
dic_file = fasta2dict("full.fa")
# functions for gc skew
def GC_skew_up(strand, loc, seq, window=100):  # need -1 for index
    values_up = []
    loc = loc - 1
    if strand == "+":
        sp_up = seq[loc - window : loc]
        g_up = sp_up.count('G') + sp_up.count('g')
        c_up = sp_up.count('C') + sp_up.count('c')
        try:
            skew_up = (g_up - c_up) / float(g_up + c_up)
        except ZeroDivisionError:
            skew_up = 0.0
        values_up.append(skew_up)
    elif strand == "-":
        sp_up = seq[loc : loc + window]
        g_up = sp_up.count('G') + sp_up.count('g')
        c_up = sp_up.count('C') + sp_up.count('c')
        try:
            skew_up = (c_up - g_up) / float(g_up + c_up)
        except ZeroDivisionError:
            skew_up = 0.0
        values_up.append(skew_up)
    return values_up

def GC_skew_dw(strand, loc, seq, window=100):
    values_dw = []
    loc = loc - 1
    if strand == "+":
        sp_dw = seq[loc : loc + window]
        g_dw = sp_dw.count('G') + sp_dw.count('g')
        c_dw = sp_dw.count('C') + sp_dw.count('c')
        try:
            skew_dw = (g_dw - c_dw) / float(g_dw + c_dw)
        except ZeroDivisionError:
            skew_dw = 0.0
        values_dw.append(skew_dw)
    elif strand == "-":
        sp_dw = seq[loc - window : loc]
        g_dw = sp_dw.count('G') + sp_dw.count('g')
        c_dw = sp_dw.count('C') + sp_dw.count('c')
        try:
            skew_dw = (c_dw - g_dw) / float(g_dw + c_dw)
        except ZeroDivisionError:
            skew_dw = 0.0
        values_dw.append(skew_dw)
    return values_dw
As I said, I want to calculate the GC skews over 100 bp of each strand from the start site of the genes.
Therefore, I wrote code that gets the chromosome name from the BED file and the corresponding sequence from the FASTA file.
Then, according to the gene name and strand information, I expected the code to find the correct start site and calculate the GC skew over the 100 bp window.
However, when I run this code, the GC skew of the - strand is wrong while the + strand is correct. (I have correct GC skew data to compare against.)
The GC skews differ from the correct data, but I don't know what the problem is.
Could anyone tell me what is wrong with this code?
Thanks in advance!
window = 100
gname = []
up = []
dw = []
for match in content:
    seq_chr = dic_file[str(match[3])]
    if match[4] == "+":
        strand = match[4]
        new = int(match[6])
        sen_up = GC_skew_up(strand, new, seq_chr, window=100)
        sen_dw = GC_skew_dw(strand, new, seq_chr, window=100)
        gname.append(match[2])
        up.append(str(sen_up[0]))
        dw.append(str(sen_dw[0]))
    if match[4] == "-":
        strand = match[4]
        new = int(match[6])
        an_up = GC_skew_up(strand, new, seq_chr, window=100)
        an_dw = GC_skew_dw(strand, new, seq_chr, window=100)
        gname.append(match[2])
        up.append(str(an_up[0]))
        dw.append(str(an_dw[0]))
tot = zip(gname, up, dw)
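For reference, a tiny sanity check of the formula used above, (G - C) / (G + C) on the sense strand with the sign flipped for the antisense strand, using the functions defined in the question and a made-up sequence:
toy_seq = "GGGGGGCCAA" * 10      # 100 bp containing 60 G and 20 C
# + strand, downstream window from position 1: (60 - 20) / (60 + 20) = 0.5
print(GC_skew_dw("+", 1, toy_seq, window=100))    # [0.5]
# - strand, downstream window ending at position 101: (20 - 60) / (60 + 20) = -0.5
print(GC_skew_dw("-", 101, toy_seq, window=100))  # [-0.5]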

How to compare two strings in a large pandas DataFrame (Python 3.x)?

I have two DataFrames from two Excel files.
1st file (awcProjectMaster) (1,500 records):
projectCode    projectName
100101         kupwara
100102         kalaroos
100103         tangdar
2nd file (village master) (more than 10 million records):
villageCode    villageName
425638         wara
783651         tangdur
986321         kalaroo
I need to compare projectName and villageName and record the percentage match.
The following code works fine, but it is slow. How can I do the same thing in a more efficient way?
import pandas as pd
from datetime import datetime

df = pd.read_excel("C:\\Users\\Desktop\\awcProjectMaster.xlsx")
df1 = pd.read_excel("C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.xlsx")

def compare(prjCode, prjName, stCode, stName, dCode, dName, sdCode, sdName, vCode, vName):
    with open(r"C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.txt", "a") as f:
        percentMatch = 0
        vLen = len(vName)
        prjLen = len(prjName)
        if vLen > prjLen:
            if vName.find(prjName) != -1:
                percentMatch = (prjLen / vLen) * 100
                f.write(prjCode + "," + prjName + "," + vCode + "," + vName + "," + str(round(percentMatch)) + "," + stCode + "," + stName + "," + dCode + "," + dName + sdCode + "," + sdName + "\n")
            else:
                res = 0
                # print(res)
        elif prjLen >= vLen:
            if prjName.find(vName) != -1:
                percentMatch = (vLen / prjLen) * 100
                f.write(prjCode + "," + prjName + "," + vCode + "," + vName + "," + str(round(percentMatch)) + "," + stCode + "," + stName + "," + dCode + "," + dName + sdCode + "," + sdName + "\n")
            else:
                res = 0
                # print(res)
    f.close()

for idx, row in df.iterrows():
    for idxv, r in df1.iterrows():
        compare(
            str(row["ProjectCode"]),
            row["ProjectName"].lower(),
            str(r["StateCensusCode"]),
            r["StateName"],
            str(r["DistrictCode"]),
            r["DistrictName"],
            str(r["SubDistrictCode"]),
            r["SubDistrictNameInEnglish"],
            str(r["VillageCode"]),
            r["VillageNameInEnglish"].lower(),
        )
Your distance metric for the strings isn't too accurate, but if it works for you, fine. (You may want to look into other options like the builtin difflib, or the Python-Levenshtein module, though.)
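For instance, a minimal difflib sketch (the strings here are just examples):
from difflib import SequenceMatcher

# ratio() returns a similarity score between 0.0 and 1.0
print(SequenceMatcher(None, "kalaroos", "kalaroo").ratio())  # ~0.93
print(SequenceMatcher(None, "kupwara", "wara").ratio())      # ~0.73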
If you really do need to compare 1,500 x 10,000,000 records pairwise, things are bound to take some time, but there are a couple things that we can do pretty easily to speed things up:
open the log file only once; there's overhead, sometimes significant, in that
refactor your comparison function into a separate unit, then apply the lru_cache() memoization decorator to make sure each pair is compared only once, and the subsequent result is cached in memory. (In addition, see how we sort the vName/prjName pair – since the actual order of the two strings doesn't matter, we end up with half the cache size.)
Then for general cleanliness,
use the csv module for streaming CSV into a file (the output format is slightly different than with your code, but you can change this with the dialect parameter to csv.writer()).
Hope this helps!
import pandas as pd
from datetime import datetime
from functools import lru_cache
import csv

df = pd.read_excel("C:\\Users\\Desktop\\awcProjectMaster.xlsx")
df1 = pd.read_excel("C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.xlsx")

log_file = open(r"C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.txt", "a")
log_writer = csv.writer(log_file)

@lru_cache()
def compare_vname_prjname(vName, prjName):
    vLen = len(vName)
    prjLen = len(prjName)
    if vLen > prjLen:
        if vName.find(prjName) != -1:
            return (prjLen / vLen) * 100
    elif prjLen >= vLen:
        if prjName.find(vName) != -1:
            return (vLen / prjLen) * 100
    return None

def compare(prjCode, prjName, stCode, stName, dCode, dName, sdCode, sdName, vCode, vName):
    # help the cache decorator out by halving the number of possible pairs:
    vName, prjName = sorted([vName, prjName])
    percent_match = compare_vname_prjname(vName, prjName)
    if percent_match is None:  # No match
        return False
    log_writer.writerow(
        [
            prjCode,
            prjName,
            vCode,
            vName,
            round(percent_match),
            stCode,
            stName,
            dCode,
            dName + sdCode,  # concatenated here, mirroring the original f.write()
            sdName,
        ]
    )
    return True

for idx, row in df.iterrows():
    for idxv, r in df1.iterrows():
        compare(
            str(row["ProjectCode"]),
            row["ProjectName"].lower(),
            str(r["StateCensusCode"]),
            r["StateName"],
            str(r["DistrictCode"]),
            r["DistrictName"],
            str(r["SubDistrictCode"]),
            r["SubDistrictNameInEnglish"],
            str(r["VillageCode"]),
            r["VillageNameInEnglish"].lower(),
        )

Performance improvement - looping with the GET method

I've built a program to fill up a database and, for the time being, it's working. Basically, the program makes a request to the app I'm using (via REST API), returns the data I want, and then manipulates it into a form acceptable for the database.
The problem is that the GET requests make the algorithm too slow, because I'm accessing the details of particular entries, so for each entry I have to make one request. I have close to 15,000 requests to make, and each row in the database takes about 1 second to create.
Is there any way to make these requests faster? How can I improve the performance of this method? And by the way, any tips on measuring the performance of the code?
Thanks in advance!!
Here's the code:
# Retrieving all the IDs I want to get the detailed info for
abc_ids = serializers.serialize('json', modelExample.objects.all(), fields=('id'))
abc_ids = json.loads(abc_ids)
abc_ids_size = len(abc_ids)

# Had to declare these guys right here because at the end of the code I use them in the functions to create and update the database
# and Python was complaining about use before assignment. Picked random values for them.
age = 0
time_to_won = 0
data = '2016-01-01 00:00:00'

# First loop -> request the detailed info of ABC
for x in range(0, abc_ids_size):
    id = abc_ids[x]['fields']['id']
    url = requests.get(
        'https://api.example.com/v3/abc/' + str(id) + '?api_token=123123123')
    info = url.json()
    dealx = dict(info)
    # Second loop -> picking the info I want to update and create in the database
    for key, result in dealx['data'].items():
        # Relevant only for ModelExample -> UPDATE
        if key == 'age':
            result = dict(result)
            age = result['total_seconds']
        # Relevant only for ModelExample -> UPDATE
        elif key == 'average_time_to_won':
            result = dict(result)
            time_to_won = result['total_seconds']
        # Relevant for Model_Example2 -> CREATE
        # Storing a date here to use further ahead in a datetime manipulation
        if key == 'add_time':
            data = str(result)
        elif key == 'time_stage':
            # Each stage has a total of seconds that the user stayed in.
            y = result['times_in_stages']
            # The user can be in any stage he wants, there's no rule about the order.
            # But there's a record of the order he chose.
            z = result['order_of_stages']
            # Creating a list to fill up with all stages info and use in the bulk_create.
            data_set = []
            index = 0
            # Setting the number of repetitions based on the number of stages in the list.
            for elemento in range(0, len(z)):
                data_set_i = {}
                # The index is to define the order of the stages.
                index = index + 1
                for key_1, result_1 in y.items():
                    if int(key_1) == z[elemento]:
                        data_set_i['stage_id'] = int(z[elemento])
                        data_set_i['index'] = int(index)
                        data_set_i['abc_id'] = id
                        # Datetime manipulation
                        if result_1 == 0 and index == 1:
                            data_set_i['add_date'] = data
                        # I know that I totally repeated the code here, I was trying to get this part shorter
                        # but I could not get it right.
                        elif result_1 > 0 and index == 1:
                            data_t = datetime.strptime(data, "%Y-%m-%d %H:%M:%S")
                            data_sum = data_t + timedelta(seconds=result_1)
                            data_sum += timedelta(seconds=3)
                            data_nova = str(data_sum.year) + '-' + str(formaters.DateNine(
                                data_sum.month)) + '-' + str(formaters.DateNine(data_sum.day)) + ' ' + str(
                                data_sum.hour) + ':' + str(formaters.DateNine(data_sum.minute)) + ':' + str(
                                formaters.DateNine(data_sum.second))
                            data_set_i['add_date'] = str(data_nova)
                        else:
                            data_t = datetime.strptime(data_set[elemento - 1]['add_date'], "%Y-%m-%d %H:%M:%S")
                            data_sum = data_t + timedelta(seconds=result_1)
                            data_sum += timedelta(seconds=3)
                            data_nova = str(data_sum.year) + '-' + str(formaters.DateNine(
                                data_sum.month)) + '-' + str(formaters.DateNine(data_sum.day)) + ' ' + str(
                                data_sum.hour) + ':' + str(formaters.DateNine(data_sum.minute)) + ':' + str(
                                formaters.DateNine(data_sum.second))
                            data_set_i['add_date'] = str(data_nova)
                data_set.append(data_set_i)
    Model_Example2_List = [Model_Example2(**vals) for vals in data_set]
    Model_Example2.objects.bulk_create(Model_Example2_List)
    ModelExample.objects.filter(abc_id=id).update(age=age, time_to_won=time_to_won)
If the bottleneck is in your network requests, there isn't much you can do except perhaps use gzip or deflate, but with requests:
The gzip and deflate transfer-encodings are automatically decoded for
you.
If you want to be doubly sure, you can add the following header to the GET request:
{ 'Accept-Encoding': 'gzip,deflate'}
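For instance, a minimal sketch (the URL and token are the placeholders from your code, and 42 is just an example id):
import requests

headers = {'Accept-Encoding': 'gzip,deflate'}
response = requests.get(
    'https://api.example.com/v3/abc/42?api_token=123123123',
    headers=headers,
)
info = response.json()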
The other alternative is to use threading and have many requests operate in parallel, a good option if you have lots of bandwidth and multiple cores.
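A rough sketch of that idea with the standard library's concurrent.futures (the URL pattern reuses the question's placeholder API; the worker count is an arbitrary choice):
from concurrent.futures import ThreadPoolExecutor
import requests

def fetch(abc_id):
    # one GET per entry, as in the original loop, but many run concurrently
    url = 'https://api.example.com/v3/abc/' + str(abc_id) + '?api_token=123123123'
    return requests.get(url).json()

abc_id_values = [1, 2, 3]  # placeholder for the ~15,000 ids taken from the model
with ThreadPoolExecutor(max_workers=20) as pool:
    results = list(pool.map(fetch, abc_id_values))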
Lastly, there are lots of different ways to profile Python, including the cProfile + KCachegrind combo.
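A minimal way to get timings with the standard library profiler (the script name is a placeholder):
# from the command line:
#   python -m cProfile -o profile.out your_script.py
# then inspect the hottest calls:
import pstats
pstats.Stats("profile.out").sort_stats("cumulative").print_stats(20)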
