Below is the code I am working with and although there are no errors in the console output it still doesn't return anything there. Why is there no output with the code? Is there anything I need to change or fix. I think I imported the scanner correctly but there may be an issue with that. Thank you!
import sys
from scanner import Scanner
from functools import cmp_to_key
class Tweet:
def __init__(self, tweeter, tweet, time):
self.tweeter = tweeter[1:]
self.tweet = tweet
self.time = time
def __str__(self):
return self.tweeter+" "+self.time
def display(self):
return self.tweeter+" " +self.tweet
def create_record(s):
tweets = []
tweeter = s.readtoken()
tweet1count = 0
while tweeter != "":
tweet = s.readstring()
t1 = Tweet(tweeter, tweet, s.readline())
tweet1count += 1
tweeter = s.readtoken()
tweets.append(t1)
return tweets
def read_records(file):
s = Scanner(file)
return create_record(s)
def is_more_recent(t1, t2):
year, month, day, time = t1.time.split()
month = ("0" + month)[-2:]
day = ("0" + day)[-2:]
timestamp1 = year + month + day + time
year, month, day, time = t2.time.split()
month = ("0" + day)[-2:]
day = ("0" + day)[-2:]
timestamp2 = year + month + day + time
return timestamp1 > timestamp2
def merge_and_sort_tweets(tweets1, tweets2):
tweets = tweets1 + tweets2
cmp_items_py3 = cmp_to_key(is_more_recent)
tweets.sort(key = cmp_items_py3)
return tweets
def write_records(file, tweets):
file = open(file, "w+")
for t in tweets:
file.write(str(t))
def main():
print("Reading Files")
tweets1 = read_records(sys.argv[1])
tweets2 = read_records(sys.argv[2])
tweet1count = len(tweets1)
tweet2count = len(tweets2)
if tweet1count > tweet2count:
print("tweet1.txt contained the most tweets with" , tweet1count)
elif tweet1count < tweet2count:
print("tweet2.txt contained the most tweets with" , tweet2count)
else:
print("tweet1.txt contains ", tweet1count, "tweets. ")
print("tweet2.txt contains ", tweet2count, "tweets. ")
print("Merging files...")
tweets = merge_and_sort_tweets(tweets1, tweets2)
print("Writing file...")
write_records(sys.argv[3], tweets)
print("File writtem. Displying", min(5, len(tweets)),"earliest tweeters and tweets.")
for i in range(min(5, len(tweets))):
print(tweets[i].display())
if __name__ == "__main__":
main()
Related
I'm new to coding and cant figure out where i'm breaking. The ValueError keeps coming up but i cant seem to figure out where i'm breaking
def sunset(date,daycycle):
sunset_date_time = ''
year = date.strftime("%Y")
year_data = daycycle.get(year)
if(year_data != None):
month_day = date.strftime("%m-%d")
result_set = year_data.get(month_day)
if(result_set != None):
sunset_time = result_set["sunset"]
sunset_date_time = year + "-" + month_day + " " + sunset_time
return datetime.datetime.strptime(sunset_date_time, "%Y-%m-%d %H:%M")
This error is caused by the date format of the variable "sunset_date_time"
When you try to return the object this variable not have the date format as "%Y-%m-%d %H:%M"
To see what format have you can try print this value or return from the function and check the order of year, month, day , hour and minutes
def sunset(date,daycycle):
sunset_date_time = ''
year = date.strftime("%Y")
year_data = daycycle.get(year)
if(year_data != None):
month_day = date.strftime("%m-%d")
result_set = year_data.get(month_day)
if(result_set != None):
sunset_time = result_set["sunset"]
sunset_date_time = year + "-" + month_day + " " + sunset_time
print(sunset_date_time)
"""
or return sunset_date_time
"""
I have a python code to parse a 1 TB log file, but the problem is my result is shown after the parsing process is finished. So for that I need to wait for 12 hours, and after 12 hours only then the result is shown. I want to know how can I parse a log file and know the result of the parsing speed every 10 seconds.
This is my code:
import re
import timeit
log_file = '/Users/kiya/Desktop/mysql/ipscan/ip.txt'
output_file ='/Users/kiya/Desktop/mysql/ipscan/k2u.csv'
name_to_check = 'MBX_AUTHENTICATION_FAILED'
class Log_minning:
def __init__(self):
self.counter = 0
def get_userdata(self):
user_att = []
list_usr = []
counterr = 0
with open(log_file, encoding='utf-8') as infile:
for line in infile:
if name_to_check in line:
username = re.search(r'(?<=userName=)(.*)(?=,)', line)
username = username.group()
date = re.search(r"([12]\d{3}(0[1-9]|1[0-2])+"
"(0[1-9]|[12]\d|3[01]))", line)
date = date.group()
time = re.search(r"(\d{9}\+\d{4})", line)
time = time.group()
ip = re.search(
r'(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.)'
'{3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])',
line
)
ip = ip.group()
user_att.append(username)
user_att.append(date)
user_att.append(time)
user_att.append(ip)
list_usr.append(user_att)
counterr = counterr + 1
self.counter = counterr
return list_usr
if __name__ == "__main__":
lm = Log_minning()
the_time = timeit.Timer(lm.get_userdata).repeat(1, 1000)
sing_time = min(the_time)/1000
speed = 600 / sing_time * lm.counter
# for line in lm.get_userdata():
# print(line)
print(
"Processing " + str(lm.counter) + " in " + str(the_time) +
"\nThe speed aproximately " + str(speed) + " data in 10 sec"
)
This is the fully scanned seconds
Processing 117 in [6.646515152002394]
My function is to read data from a file that consists of dates with times a tweet was written, and sentiments (good, bad or neutral) it's classified as; select date with times, and sentiments between a start and end date; and finally create three dictionaries (positive, negative and neutral) that use the date as key, and number of positive, negative or neutral tweets made in a day.
The problems I have are:
a) How do I get only date to display, and not date and time?.
b) How do I get my program to include both start and end date?
c) How do I separate a key and value with a semi-colon in a dictionary?
def get_sentiment_dates(start_date, end_date):
positive_dict = {}
negative_dict = {}
neutral_dict = {}
f = open("BAC2_answer.csv", "r")
tweets = f.readlines()
bin_use =[]
bin_trash =[]
bin_use_senti = []
bin_trash_senti = []
start_date_obj = datetime.strptime(start_date, '%Y-%m-%d')
end_date_obj = datetime.strptime(end_date, '%Y-%m-%d')
for i in tweets:
specs = i.split(',')
t_and_d = specs[0]
dt_obj = datetime.strptime(t_and_d, "%Y-%m-%d %H:%M:%S")
chars_body = specs[1].strip()
if ((dt_obj >= start_date_obj) and dt_obj <= (end_date_obj)):
bin_use.append(dt_obj)
bin_use_senti.append(chars_body)
else:
bin_trash.append(dt_obj)
bin_trash_senti.append(chars_body)
num_of_pos = 0
num_of_neg = 0
num_of_neut = 0
for i,j in zip(bin_use, bin_use_senti):
if j == 'Bullish':
num_of_pos +=1
positive_dict = (i, num_of_pos)
elif j == 'Bearish':
num_of_neg+=1
negative_dict = (i, num_of_neg)
else:
num_of_neut+=1
neutral_dict = (i, num_of_neut)
# print str(positive_dict) + "," + str(negative_dict) + "," + str(neutral_dict)
f.close()
return [positive_dict,negative_dict,neutral_dict]
I have written a script that uses pool.map to process multiple netCDF files and store information in a table. Each process runs a function to process one year. Each year has it's own individual file geodatabase, table within that geodatabase, and mxd. I also set the default workspace and scratch workspace to that geodatabase. For example when the function loads the year 1979 it accesses the 1979 geodatabase, 1979 table within that geodatabase, and 1979 mxd. 1980 would access the 1980 geodatabase, 1970 table within that geodatabase, and 1980 mxd.
If I run 1 process everything works fine. If I try to run 2 or more I get Fatal Error (INFADI) Missing Directory. Right now I'm running 6 processes. 4 Crash and the other 2 keep going without a problem.
Here is the code:
# Multiprocessing netCDF data into a table
######################################
import arcpy, calendar, datetime, numpy, multiprocessing, sys, re, timeit, os
from arcpy.sa import *
#Receive day and year and return the date str in MM/DD/YYYY
def getDate(day, year):
date = datetime.datetime(year, 1, 1) + datetime.timedelta(day)
date = date.timetuple()
date = str(date[1]) + '/' + str(date[2]) + '/' + str(date[0])
return date
#Main loop
#Receive a year int and process all dates within "good" months
def doCalc(year):
yearstr = str(year)
print('Starting doCalc: ' + yearstr)
############################################
#### CHANGE THIS INPUT ####
Species = 'Mallard'
Regiondb = 'North_America' #Spaces not allowed in filename to map
Region = 'Duck Zone' #Spaces allowed in DB
regionField = 'ZONE_NAME'
############################################
defaultGDB = "D:\\GIS\projects\\LCC_WSI_Climate\\year" + yearstr + ".gdb"
#Setting environmental variables
arcpy.env.workspace = defaultGDB
arcpy.env.scratchWorkspace = defaultGDB
arcpy.env.overwriteOutput = True
#desired months
goodmonth = (1, 2, 3, 9, 10, 11, 12)
#Acquire necessary extension and exit if it can't acquire
#Spatial Extension
try:
if arcpy.CheckExtension("Spatial") == "Available":
arcpy.CheckOutExtension("Spatial")
print("Acquired Spatial license")
else:
sys.exit("No Spatial Analyst license available")
except:
sys.exit("No Spatial Analyst license available")
#Geostats Extension
try:
if arcpy.CheckExtension("GeoStats") == "Available":
arcpy.CheckOutExtension("GeoStats")
print("Acquired GeoStats license")
else:
sys.exit("No GeoStats license available")
except:
sys.exit("No GeoStats license available")
#Try and except statements currently used for debugging and that is why the exceps are not specific.
try:
#Select map document and set up layers. Using a map document because NetCDFRasters aren't
#playing nice if not "living" in a document
print('Starting :' + yearstr)
start = timeit.default_timer()
mxd = arcpy.mapping.MapDocument("D:/GIS/projects/LCC_WSI_Climate/python code/WSI_maps"+yearstr+".mxd")
df = arcpy.mapping.ListDataFrames(mxd)[0]
#Set the table to write to according to the year received
for table in arcpy.mapping.ListTableViews(mxd):
if table.name == 'T'+yearstr:
WSITable = table
#Set the Clip layer according to the Region specified above
for dflayer in arcpy.mapping.ListLayers(mxd,"", df):
if dflayer.name == Region:
WSIClip = dflayer
if dflayer.name == 'wsi_Layer':
WSILayer = dflayer
#Set directory where netCDF files reside
direct = "D:/GIS/projects/LCC_WSI_Climate/python code/wsi/"
#Set netCDF file according to year received
inputLayer = direct +'wsi.' + yearstr + '.nc'
#If it's 1979 it starts in September.
if year == 1979:
startday = 243
else:
startday = 0
#Make sure the wsi_Layer is the correct file.
arcpy.MakeNetCDFRasterLayer_md(inputLayer, "wsi", "x", "y", "wsi_Layer")
#Checks if the current year is a leap year
if calendar.isleap(year):
maxday = 366
else:
maxday = 365
#Cycle through every day within the year
for daycnt in range(startday, maxday):
day = 0
sendday = daycnt+1
date = getDate(daycnt, year)
newdate = datetime.datetime(year, 1, 1) + datetime.timedelta(daycnt)
newdate = newdate.timetuple()
month = newdate[1]
day = newdate[2]
#If the month is not desired it will skip the day and continue with the next day
if month not in goodmonth:
continue
datestr = str(month) + '/' + str(day) + '/' + str(year)
print(datestr)
#Use the Select by Dimension tool to change the netCDF layer to the current date
WSILayerRas = Raster("wsi_Layer")
arcpy.SelectByDimension_md(WSILayerRas, [["time", date]],"BY_VALUE")
#Save the file in defaultGDB. Processing didn't work without saving.
WSILayerRas.save("Temp"+yearstr)
##########################################
## Regions
##
wsikm = 0
datalist = []
#Calculate time
time = 'time ' + str(date)
#Setup the cursor to write to the output Table defined above (taken from mxd).
cursorout = arcpy.da.InsertCursor(WSITable, ("CATEGORY", "STATE", "SUBCATEGORY", "DATE","SQKM", "SPECIES"))
#Setup search cursor to go through the input dataset and clip raster to the shape of each feature.
#Copy data to the output table
with arcpy.da.SearchCursor(WSIClip,(regionField, "SHAPE#", "STATE_NAME")) as cursorin:
for row in cursorin:
AOIname = row[0]
AOIshape = row[1]
AOIextent = AOIshape.extent
AOIstate = row[2]
#dealing with odd characters and spaces
AOIname = re.sub("\s+", "", AOIname)
AOIname = AOIname.strip()
AOIname = AOIname.replace("'", "")
AOIname = AOIname.replace("/", "_")
AOIstatea = re.sub("\s+", "", AOIstate)
#print('State: ' + AOIstate + ', AOI: ' + AOIname)
savetemp = AOIstatea + '_' + AOIname + '_' + yearstr
#Process crashes running this try/except. The except doesn't catch it.
try:
deleteme = Raster(arcpy.gp.ExtractByMask_sa(WSILayerRas, AOIshape))
except:
continue
deleteme.save(savetemp)
#Add raster to an array for deletion later
datalist.append(deleteme)
#Convert the Extracted raster to a NumPy array and extract desired values
#by incrementing a counter and calculating area.
my_array = arcpy.RasterToNumPyArray(deleteme)
rows, cols = my_array.shape
countAOI = 0
wsikm = 0
#time calculation
for rowNum in xrange(rows):
for colNum in xrange(cols):
value = my_array.item(rowNum, colNum)
if value >= 7.2:
countAOI +=1
wsikm = countAOI * 1024
#write to the output Table
cursorout.insertRow((Region,AOIstate, AOIname, datestr, wsikm, Species))
#Cleanup the geodatabase
## print('Cleaning up')
arcpy.Delete_management(savetemp)
datasetList = arcpy.ListDatasets("Extract_W*", "Raster")
try:
for dataset in datasetList:
arcpy.Delete_management(dataset)
except:
continue
#attempts at fixing the error
deleteme = None
del cursorout
del cursorin
#Finish calculating time processing 1 entire year
stop = timeit.default_timer()
print stop - start
except Exception as e:
#print sys.exc_traceback.tb_lineno
return e
####
# MAIN
####
if __name__ == '__main__':
print('Starting script')
#Start timing entire process
start = timeit.default_timer()
#Year Range
#Entire dataset
#yearlist = list(range(1979, 2013))
#Sample
yearlist = list(range(1979, 1986))
#Create pool
print("Creating pool")
pool = multiprocessing.Pool(7)
#Call doCalc and pass the year list
pool.map(doCalc, yearlist)
## e = doCalc(1979)
print("Closing pool")
pool.close()
print("Joining pool")
pool.join()
#print(e[0])
stop = timeit.default_timer()
print stop - start
print("Complete")
The fix was found and posted http://forums.arcgis.com/threads/109606-Multiprocessing-script-errors-on-geoprocessing-line-of-code-INFADI-(Missing-Dir)?p=387987&posted=1#post387987
The truck is to set your os.environ("TEMP") as well as TMP uniquely within the process.
def doCalc(year):
yearstr = str(year)
import time
time.sleep(1.1)
newTempDir = r"C:\temp\gptmpenvr_" + time.strftime('%Y%m%d%H%M%S') + yearstr
os.mkdir(newTempDir)
os.environ["TEMP"] = newTempDir
os.environ["TMP"] = newTempDir
print('Starting doCalc: ' + yearstr)
I have a simple program which reads a large file containing few million rows, parses each row (numpy array) and converts into an array of doubles (python array) and later writes into an hdf5 file. I repeat this loop for multiple days. After reading each file, i delete all the objects and call garbage collector. When I run the program, First day is parsed without any error but on the second day i get MemoryError. I monitored the memory usage of my program, during first day of parsing, memory usage is around 1.5 GB. When the first day parsing is finished, memory usage goes down to 50 MB. Now when 2nd day starts and i try to read the lines from the file I get MemoryError. Following is the output of the program.
source file extracted at C:\rfadump\au\2012.08.07.txt
parsing started
current time: 2012-09-16 22:40:16.829000
500000 lines parsed
1000000 lines parsed
1500000 lines parsed
2000000 lines parsed
2500000 lines parsed
3000000 lines parsed
3500000 lines parsed
4000000 lines parsed
4500000 lines parsed
5000000 lines parsed
parsing done.
end time is 2012-09-16 23:34:19.931000
total time elapsed 0:54:03.102000
repacking file
done
> s:\users\aaj\projects\pythonhf\rfadumptohdf.py(132)generateFiles()
-> while single_date <= self.end_date:
(Pdb) c
*** 2012-08-08 ***
source file extracted at C:\rfadump\au\2012.08.08.txt
cought an exception while generating file for day 2012-08-08.
Traceback (most recent call last):
File "rfaDumpToHDF.py", line 175, in generateFile
lines = self.rawfile.read().split('|\n')
MemoryError
I am very sure that windows system task manager shows the memory usage as 50 MB for this process. It looks like the garbage collector or memory manager for Python is not calculating the free memory correcly. There should be lot of free memory but it thinks there is not enough.
Any idea?
EDIT
Adding my code here
I will put parts of my code. I am new to python, please pardon my python coding style.
module 1
def generateFile(self, current_date):
try:
print "*** %s ***" % current_date.strftime("%Y-%m-%d")
weekday=current_date.weekday()
if weekday >= 5:
print "skipping weekend"
return
self.taqdb = taqDB(self.index, self.offset)
cache_filename = os.path.join(self.cache_dir,current_date.strftime("%Y.%m.%d.h5"))
outputFile = config.hdf5.filePath(self.index, date=current_date)
print "cache file: ", cache_filename
print "output file: ", outputFile
tempdir = "C:\\rfadump\\"+self.region+"\\"
input_filename = tempdir + filename
print "source file extracted at %s " % input_filename
## universe
reader = rfaTextToTAQ.rfaTextToTAQ(self.tickobj) ## PARSER
count = 0
self.rawfile = open(input_filename, 'r')
lines = self.rawfile.read().split('|\n')
total_lines = len(lines)
self.rawfile.close()
del self.rawfile
print "parsing started"
start_time = dt.datetime.now()
print "current time: %s" % start_time
#while(len(lines) > 0):
while(count < total_lines):
#line = lines.pop(0) ## This slows down processing
result = reader.parseline(lines[count]+"|")
count += 1
if(count % 500000 == 0):
print "%d lines parsed" %(count)
if(result == None):
continue
ric, timestamp, quotes, trades, levelsUpdated, tradeupdate = result
if(len(levelsUpdated) == 0 and tradeupdate == False):
continue
self.taqdb.insert(result)
## write to hdf5 TODO
writer = h5Writer.h5Writer(cache_filename, self.tickobj)
writer.write(self.taqdb.groups)
writer.close()
del lines
del self.taqdb, self.tickobj
##########################################################
print "parsing done."
end_time = dt.datetime.now()
print "end time is %s" % end_time
print "total time elapsed %s" % (end_time - start_time)
defragger = hdf.HDF5Defragmenter()
defragger.Defrag(cache_filename,outputFile)
del defragger
print "done"
gc.collect(2)
except:
print "cought an exception while generating file for day %s." % current_date.strftime("%Y-%m-%d")
tb = traceback.format_exc()
print tb
module 2 - taqdb - to store parsed data in an array
class taqDB:
def __init__(self, index, offset):
self.index = index
self.tickcfg = config.hdf5.getTickConfig(index)
self.offset = offset
self.groups = {}
def getGroup(self,ric):
if (self.groups.has_key(ric) == False):
self.groups[ric] = {}
return self.groups[ric]
def getOrderbookArray(self, ric, group):
datasetname = orderBookName
prodtype = self.tickcfg.getProdType(ric)
if(prodtype == ProdType.INDEX):
return
orderbookArrayShape = self.tickcfg.getOrderBookArrayShape(prodtype)
if(group.has_key(datasetname) == False):
group[datasetname] = array.array("d")
orderbookArray = self.tickcfg.getOrderBookArray(prodtype)
return orderbookArray
else:
orderbookArray = group[datasetname]
if(len(orderbookArray) == 0):
return self.tickcfg.getOrderBookArray(prodtype)
lastOrderbook = orderbookArray[-orderbookArrayShape[1]:]
return np.array([lastOrderbook])
def addToDataset(self, group, datasetname, timestamp, arr):
if(group.has_key(datasetname) == False):
group[datasetname] = array.array("d")
arr[0,0]=timestamp
a1 = group[datasetname]
a1.extend(arr[0])
def addToOrderBook(self, group, timestamp, arr):
self.addToDataset(self, group, orderBookName, timestamp, arr)
def insert(self, data):
ric, timestamp, quotes, trades, levelsUpdated, tradeupdate = data
delta = dt.timedelta(hours=timestamp.hour,minutes=timestamp.minute, seconds=timestamp.second, microseconds=(timestamp.microsecond/1000))
timestamp = float(str(delta.seconds)+'.'+str(delta.microseconds)) + self.offset
## write to array
group = self.getGroup(ric)
orderbookUpdate = False
orderbookArray = self.getOrderbookArray(ric, group)
nonzero = quotes.nonzero()
orderbookArray[nonzero] = quotes[nonzero]
if(np.any(nonzero)):
self.addToDataset(group, orderBookName, timestamp, orderbookArray)
if(tradeupdate == True):
self.addToDataset(group, tradeName, timestamp, trades)
Module 3- Parser
class rfaTextToTAQ:
"""RFA Raw dump file reader. Readers single line (record) and returns an array or array of fid value pairs."""
def __init__(self,tickconfig):
self.tickconfig = tickconfig
self.token = ''
self.state = ReadState.SEQ_NUM
self.fvstate = fvstate.FID
self.quotes = np.array([]) # read from tickconfig
self.trades = np.array([]) # read from tickconfig
self.prodtype = ProdType.STOCK
self.allquotes = {}
self.alltrades = {}
self.acvol = 0
self.levelsUpdated = []
self.quoteUpdate = False
self.tradeUpdate = False
self.depth = 0
def updateLevel(self, index):
if(self.levelsUpdated.__contains__(index) == False):
self.levelsUpdated.append(index)
def updateQuote(self, fidindex, field):
self.value = float(self.value)
if(self.depth == 1):
index = fidindex[0]+(len(self.tickconfig.stkQuotes)*(self.depth - 1))
self.quotes[index[0]][fidindex[1][0]] = self.value
self.updateLevel(index[0])
else:
self.quotes[fidindex] = self.value
self.updateLevel(fidindex[0][0])
self.quoteUpdate = True
def updateTrade(self, fidindex, field):
#self.value = float(self.value)
if(self.tickconfig.tradeUpdate(self.depth) == False):
return
newacvol = float(self.value)
if(field == acvol):
if(self.value > self.acvol):
tradesize = newacvol - self.acvol
self.acvol = newacvol
self.trades[fidindex] = tradesize
if(self.trades.__contains__(0) == False):
self.tradeUpdate = True
else:
self.trades[fidindex] = self.value
if(not (self.trades[0,1]==0 or self.trades[0,2]==0)):
self.tradeUpdate = True
def updateResult(self):
field = ''
valid, field = field_dict.FIDToField(int(self.fid), field)
if(valid == False):
return
if(self.value == '0'):
return
if(self.prodtype == ProdType.STOCK):
fidindex = np.where(self.tickconfig.stkQuotes == field)
if(len(fidindex[0]) == 0):
fidindex = np.where(self.tickconfig.stkTrades == field)
if(len(fidindex[0]) == 0):
return
else:
self.updateTrade(fidindex, field)
else:
self.updateQuote(fidindex, field)
else:
fidindex = np.where(self.tickconfig.futQuotes == field)
if(len(fidindex[0]) == 0):
fidindex = np.where(self.tickconfig.futTrades == field)
if(len(fidindex[0]) == 0):
return
else:
self.updateTrade(fidindex, field)
else:
self.updateQuote(fidindex, field)
def getOrderBookTrade(self):
if (self.allquotes.has_key(self.ric) == False):
acvol = 0
self.allquotes[self.ric] = self.tickconfig.getOrderBookArray(self.prodtype)
trades = self.tickconfig.getTradesArray()
self.alltrades[self.ric] = [trades, acvol]
return self.allquotes[self.ric], self.alltrades[self.ric]
def parseline(self, line):
self.tradeUpdate = False
self.levelsUpdated = []
pos = 0
length = len(line)
self.state = ReadState.SEQ_NUM
self.fvstate = fvstate.FID
self.token = ''
ch = ''
while(pos < length):
prevChar = ch
ch = line[pos]
pos += 1
#SEQ_NUM
if(self.state == ReadState.SEQ_NUM):
if(ch != ','):
self.token += ch
else:
self.seq_num = int(self.token)
self.state = ReadState.TIMESTAMP
self.token = ''
# TIMESTAMP
elif(self.state == ReadState.TIMESTAMP):
if(ch == ' '):
self.token = ''
elif(ch != ','):
self.token += ch
else:
if(len(self.token) != 12):
print "Invalid timestamp format. %s. skipping line.\n", self.token
self.state = ReadState.SKIPLINE
else:
self.timestamp = datetime.strptime(self.token,'%H:%M:%S.%f')
self.state = ReadState.RIC
self.token = ''
# RIC
elif(self.state == ReadState.RIC):
if(ch != ','):
self.token += ch
else:
self.ric = self.token
self.token = ''
self.ric, self.depth = self.tickconfig.replaceRic(self.ric)
self.prodtype = self.tickconfig.getProdType(self.ric)
if(self.tickconfig.subscribed(self.ric)):
self.state = ReadState.UPDATE_TYPE
self.quotes, trades = self.getOrderBookTrade()
self.trades = trades[0]
self.acvol = trades[1]
else:
self.state = ReadState.SKIPLINE
# UPDATE_TYPE
elif(self.state == ReadState.UPDATE_TYPE):
if(ch != '|'):
self.token += ch
else:
self.update_type = self.token
self.token = ''
self.state = ReadState.FVPAIRS
#SKIPLINE
elif(self.state == ReadState.SKIPLINE):
return None
# FV PAIRS
elif(self.state == ReadState.FVPAIRS):
# FID
if(self.fvstate == fvstate.FID):
if(ch != ','):
if(ch.isdigit() == False):
self.token = self.value+ch
self.fvstate = fvstate.FIDVALUE
self.state = ReadState.FVPAIRS
else:
self.token += ch
else:
self.fid = self.token
self.token = ''
self.fvstate = fvstate.FIDVALUE
self.state = ReadState.FVPAIRS
# FIDVALUE
elif(self.fvstate == fvstate.FIDVALUE):
if(ch != '|'):
self.token += ch
else:
self.value = self.token
self.token = ''
self.state = ReadState.FVPAIRS
self.fvstate = fvstate.FID
# TODO set value
self.updateResult()
return self.ric, self.timestamp, self.quotes, self.trades, self.levelsUpdated, self.tradeUpdate
Thanks.
The only reliable way to free memory is to terminate the process.
So, if your main program spawns a worker process to do most of the work (the stuff that is done in one day) then when that worker process completes, the memory used will be freed:
import multiprocessing as mp
def work(date):
# Do most of the memory-intensive work here
...
while single_date <= self.end_date:
proc = mp.Process(target = work, args = (single_date,))
proc.start()
proc.join()