Processing big files using the requests library and DictReader in Python

I'm trying to process huge data files (> 1.6 GB), using the requests library to read the data in chunks.
import urllib2, json, csv
import itertools
import multiprocessing
from datetime import datetime

import requests


def getTaxiTrips(date):
    """
    Gets the taxi trips that occurred in NY from a starting date.
    :param date: (Y-m-d).
    :return: list of tuples (long, lat, drop off date).
    """
    today = str(datetime.date(datetime.now())).split('-')
    today_y = today[0]
    today_m = today[1]
    start = date.split('-')
    start_y = start[0]
    start_m = start[1]
    print start_m + "-" + start_y + " / " + today_m + "-" + today_y
    data = []
    y = int(start_y)
    m = int(start_m)
    while int(start_y) <= int(today_y):
        # Month transformation
        if m > 12:
            m %= 12
            y += 1
        mt = str(m) if m > 9 else '0' + str(m)
        # Green cabs (readCSV is defined elsewhere in my code)
        if readCSV("https://storage.googleapis.com/tlc-trip-data/" + str(y) +
                   "/green_tripdata_" + str(y) + "-" + mt + ".csv") is not None:
            data.append("https://storage.googleapis.com/tlc-trip-data/" + str(y) +
                        "/green_tripdata_" + str(y) + "-" + mt + ".csv")
        if m == int(today_m):
            break
        m += 1
    pool = multiprocessing.Pool(mps - 1)  # mps = number of processes, set elsewhere
    result = pool.map(consumeTaxiData, data)
    pool.close()
    pool.join()
    return list(itertools.chain(*result))


def consumeTaxiData(url):
    """
    Given a url, reads its content and processes its data.
    :param url: the url to be read.
    :return: a list of tuples in the form (long, lat, hour).
    """
    print "Processing", url
    points = []
    r = requests.get(url, stream=True)
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            reader = csv.DictReader(chunk.splitlines(), delimiter=',')
            for line in reader:
                print line
                latitude = line.get('dropoff_latitude', None)
                if latitude is None:
                    latitude = line.get('Dropoff_latitude', None)
                longitude = line.get('dropoff_longitude', None)
                if longitude is None:
                    longitude = line.get('Dropoff_longitude', None)
                time = line.get('tpep_dropoff_datetime', None)
                if time is None:
                    time = line.get('Lpep_dropoff_datetime', None)
                if time is not None and latitude is not None and longitude is not None and \
                        datetime.strptime(time, '%Y-%m-%d %H:%M:%S') >= datetime.strptime(date, '%Y-%m-%d'):
                    time = roundTime(datetime.strptime(time, '%Y-%m-%d %H:%M:%S'), roundTo=60 * 60).hour
                    points.append((longitude, latitude, time))
    return points
Here is one example file:
https://storage.googleapis.com/tlc-trip-data/2015/green_tripdata_2015-06.csv
I'm not sure how to use this idea properly, because the first line of each file is a header that specifies the keys of the content I need to capture. The headers may change between files, so I thought about reading them with csv.DictReader. But does that work across chunks? The header would only be captured in the first chunk, right? Is there a way to preserve the keys so that csv.DictReader can still be used?
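One way (a minimal sketch, not the only one): rather than parsing each raw chunk on its own, hand the streamed response to csv.DictReader as an iterator of lines. iter_lines() reassembles lines across chunk boundaries, so the reader consumes the header once and reuses those field names for every following row. The date filtering and roundTime call from the original function are left out here, and on Python 2 the decoding may need adjusting:

import csv
import requests

def consume_taxi_data(url):
    # variant of consumeTaxiData that keeps the CSV header across chunks
    points = []
    r = requests.get(url, stream=True)
    # iter_lines() yields complete lines regardless of chunk boundaries;
    # DictReader reads the header from the first line and keeps the keys.
    reader = csv.DictReader(r.iter_lines(decode_unicode=True))
    for line in reader:
        latitude = line.get('dropoff_latitude') or line.get('Dropoff_latitude')
        longitude = line.get('dropoff_longitude') or line.get('Dropoff_longitude')
        dropoff = line.get('tpep_dropoff_datetime') or line.get('Lpep_dropoff_datetime')
        if latitude and longitude and dropoff:
            points.append((longitude, latitude, dropoff))
    return points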

Related

PDF splitting with bookmarks in Python through PyPDF4 - bookmarks are lost in the output

I am trying to create a script to split the PDF pages for given page numbers/labels. The script produces the split PDFs correctly, but some information is lost and needs to be corrected:
1. Bookmarks are lost in the separated PDF if the original PDF contains bookmarks.
2. If the PDF contains page labels with both Roman and Arabic page numbers, for example a prelims part starting at i, ii, iii, iv ... and a main matter part starting again at Arabic 1, 2, 3 ..., then passing an Arabic number range for the split actually splits the prelims part (Roman page numbers): asking for start = 5, end = 10 splits from start = v to end = x.
How can I correct these issues in the script below?
MWE
import re
import regex
import sys
import os
from iPython.ErrorLog import *
from iPython.LaTeX_QC_validation import *
#from pdfrw import PdfReader, PdfWriter
from PyPDF4 import PdfFileWriter, PdfFileReader
from pdfrw import PdfReader, PdfWriter
from pagelabels import PageLabels, PageLabelScheme

pg_info = open('pageinfo.txt', 'r')
pgcnt = pg_info.read()
pg_info.close()
print(pgcnt)
# Merge each StartPage/EndPage pair into a single element.
# Note: re.sub() takes flags as a keyword argument; passed positionally they are treated as the count.
pgcnt = re.sub(r'<Misc="([0-9]+)" StartPage="([^">].*)">\s*<Misc="(?:[0-9]+)" EndPage="([^">].*)"/>',
               r'<Misc="\1" StartPage="\2" EndPage="\3"/>',
               pgcnt, flags=re.I | re.S | re.M)
print(pgcnt)
pno = []

def value(rno):
    r = rno.upper()
    if r == 'I':
        return 1
    if r == 'V':
        return 5
    if r == 'X':
        return 10
    if r == 'L':
        return 50
    if r == 'C':
        return 100
    if r == 'D':
        return 500
    if r == 'M':
        return 1000
    return -1
def romanToDecimal(str):
    res = 0
    i = 0
    while i < len(str):
        # Getting value of symbol s[i]
        s1 = value(str[i])
        if i + 1 < len(str):
            # Getting value of symbol s[i + 1]
            s2 = value(str[i + 1])
            # Comparing both values
            if s1 >= s2:
                # Value of current symbol is greater than or equal to the next symbol
                res = res + s1
                i = i + 1
            else:
                # Value of current symbol is smaller than the next symbol
                res = res + s2 - s1
                i = i + 2
        else:
            res = res + s1
            i = i + 1
    return res
def get_pageInfo(pginfo):
    global pno
    for m in re.finditer(r'<Misc="([0-9]+)" StartPage="([^">].*)" EndPage="([^">].*)"/>', pginfo, re.I):
        Start_page = m.group(2)
        End_page = m.group(3)
        x = Start_page
        y = End_page
        numeric_test = x.isnumeric()
        if not numeric_test:
            Start_page = romanToDecimal(Start_page)
        else:
            Start_page = int(Start_page)
        numeric_test = y.isnumeric()
        if not numeric_test:
            End_page = romanToDecimal(End_page)
        else:
            End_page = int(End_page)
        print(x, Start_page, y, End_page)
        pno.append((Start_page, End_page))
    return pno

pgdetails = get_pageInfo(pgcnt)
print(pgdetails)
def pdf_splitter(file, start, end, fcount):
    fix_start = start
    # Save each split PDF as "<count>.pdf", e.g. the first range becomes "1.pdf"
    new_file_name = str(fcount) + ".pdf"
    read_file = PdfFileReader(open(file, "rb"))  # read pdf
    new_pdf = PdfFileWriter()  # create write object
    start -= 1
    try:
        with open(new_file_name, "wb") as f:
            for i in range(start, end):
                new_pdf.addPage(read_file.getPage(i))
            new_pdf.write(f)
        print("PDF split successfully")
        reader = PdfReader(new_file_name)
        labels = PageLabels.from_pdf(reader)
        newlabel = PageLabelScheme(startpage=0,              # index of the page of the PDF where the labels will start
                                   style="roman lowercase",  # see options in PageLabelScheme.styles()
                                   prefix="",
                                   firstpagenum=fix_start)   # number to attribute to the first page of this index
        labels.append(newlabel)  # adding our page labels to the existing ones
        labels.write(reader)
        writer = PdfWriter()
        writer.trailer = reader
        writer.write(new_file_name)
    except Exception as e:
        print(e)

x = 0
for i in pgdetails:
    x += 1
    Start, End = i
    pdf_splitter('input.pdf', Start, End, x)
sys.exit()
The page information file (pageinfo.txt) contains the information below:
<Misc="1" StartPage="i">
<Misc="1" EndPage="ii"/>
<Misc="2" StartPage="ii">
<Misc="2" EndPage="ii"/>
<Misc="3" StartPage="iv">
<Misc="3" EndPage="iv"/>
<Misc="4" StartPage="v">
<Misc="4" EndPage="vi"/>
<Misc="5" StartPage="vii">
<Misc="5" EndPage="xiv"/>
<Misc="6" StartPage="xv">
<Misc="6" EndPage="xv"/>
<Misc="7" StartPage="xvi">
<Misc="7" EndPage="xviii"/>
<Misc="8" StartPage="xix">
<Misc="8" EndPage="xx"/>
Thanks in Advance
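For the bookmark part, a hedged sketch of one possible approach, assuming PyPDF4 still exposes the getOutlines(), getDestinationPageNumber() and addBookmark() methods it inherited from PyPDF2 1.26 (copy_bookmarks is a hypothetical helper; nested bookmarks and the Roman/Arabic label mapping are not handled here):

def copy_bookmarks(read_file, new_pdf, start, end):
    # start/end are 0-based page indices of the split range, end exclusive
    for item in read_file.getOutlines():
        if isinstance(item, list):
            continue  # skip nested children in this sketch
        page = read_file.getDestinationPageNumber(item)
        if start <= page < end:
            new_pdf.addBookmark(item.title, page - start)

# inside pdf_splitter, after the pages have been added and before new_pdf.write(f):
# copy_bookmarks(read_file, new_pdf, start, end)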

How to compare two strings in a large pandas DataFrame (Python 3.x)?

I have two DataFrames from two Excel files.
1st file (awcProjectMaster) (1,500 records):
projectCode projectName
100101 kupwara
100102 kalaroos
100103 tangdar
2nd file (village master) (more than 10 million records):
villageCode villageName
425638 wara
783651 tangdur
986321 kalaroo
I need to compare projectName and villageName and get the percentage match.
The following code works fine but it is slow. How can I do the same thing in a more efficient way?
import pandas as pd
from datetime import datetime

df = pd.read_excel("C:\\Users\\Desktop\\awcProjectMaster.xlsx")
df1 = pd.read_excel("C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.xlsx")

def compare(prjCode, prjName, stCode, stName, dCode, dName, sdCode, sdName, vCode, vName):
    with open(r"C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.txt", "a") as f:
        percentMatch = 0
        vLen = len(vName)
        prjLen = len(prjName)
        if vLen > prjLen:
            if vName.find(prjName) != -1:
                percentMatch = (prjLen / vLen) * 100
                f.write(prjCode + "," + prjName + "," + vCode + "," + vName + "," + str(round(percentMatch)) + "," + stCode + "," + stName + "," + dCode + "," + dName + sdCode + "," + sdName + "\n")
            else:
                res = 0
                # print(res)
        elif prjLen >= vLen:
            if prjName.find(vName) != -1:
                percentMatch = (vLen / prjLen) * 100
                f.write(prjCode + "," + prjName + "," + vCode + "," + vName + "," + str(round(percentMatch)) + "," + stCode + "," + stName + "," + dCode + "," + dName + sdCode + "," + sdName + "\n")
            else:
                res = 0
                # print(res)
    f.close()

for idx, row in df.iterrows():
    for idxv, r in df1.iterrows():
        compare(
            str(row["ProjectCode"]),
            row["ProjectName"].lower(),
            str(r["StateCensusCode"]),
            r["StateName"],
            str(r["DistrictCode"]),
            r["DistrictName"],
            str(r["SubDistrictCode"]),
            r["SubDistrictNameInEnglish"],
            str(r["VillageCode"]),
            r["VillageNameInEnglish"].lower(),
        )
Your distance metric for the strings isn't too accurate, but if it works for you, fine. (You may want to look into other options like the builtin difflib, or the Python-Levenshtein module, though.)
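For instance, a minimal sketch with the built-in difflib (standard library); ratio() returns a similarity in [0, 1] based on matching blocks, so it also catches partial matches that find() misses:

from difflib import SequenceMatcher

def similarity(a, b):
    # 1.0 means identical strings, 0.0 means nothing in common
    return SequenceMatcher(None, a, b).ratio()

print(round(similarity("kalaroos", "kalaroo") * 100))  # -> 93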
If you really do need to compare 1,500 x 10,000,000 records pairwise, things are bound to take some time, but there are a couple of things we can do pretty easily to speed things up:
- open the log file only once; there is overhead, sometimes significant, in reopening it for every comparison
- refactor your comparison function into a separate unit, then apply the lru_cache() memoization decorator to make sure each pair is compared only once, with the result cached in memory (in addition, see how we sort the vName/prjName pair: since the actual order of the two strings doesn't matter, we end up with half the cache size)
Then, for general cleanliness:
- use the csv module for streaming CSV into a file (the output format is slightly different from your code's, but you can change this with the dialect parameter to csv.writer())
Hope this helps!
import pandas as pd
from datetime import datetime
from functools import lru_cache
import csv

df = pd.read_excel("C:\\Users\\Desktop\\awcProjectMaster.xlsx")
df1 = pd.read_excel("C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.xlsx")

log_file = open(r"C:\\Users\\Desktop\\prjToVillageStateWise\\stCodeVillage1To6.txt", "a")
log_writer = csv.writer(log_file)

@lru_cache()
def compare_vname_prjname(vName, prjName):
    vLen = len(vName)
    prjLen = len(prjName)
    if vLen > prjLen:
        if vName.find(prjName) != -1:
            return (prjLen / vLen) * 100
    elif prjLen >= vLen:
        if prjName.find(vName) != -1:
            return (vLen / prjLen) * 100
    return None

def compare(prjCode, prjName, stCode, stName, dCode, dName, sdCode, sdName, vCode, vName):
    # help the cache decorator out by halving the number of possible pairs:
    vName, prjName = sorted([vName, prjName])
    percent_match = compare_vname_prjname(vName, prjName)
    if percent_match is None:  # No match
        return False
    log_writer.writerow(
        [
            prjCode,
            prjName,
            vCode,
            vName,
            round(percent_match),
            stCode,
            stName,
            dCode,
            dName + sdCode,
            sdName,
        ]
    )
    return True

for idx, row in df.iterrows():
    for idxv, r in df1.iterrows():
        compare(
            str(row["ProjectCode"]),
            row["ProjectName"].lower(),
            str(r["StateCensusCode"]),
            r["StateName"],
            str(r["DistrictCode"]),
            r["DistrictName"],
            str(r["SubDistrictCode"]),
            r["SubDistrictNameInEnglish"],
            str(r["VillageCode"]),
            r["VillageNameInEnglish"].lower(),
        )

Performance improvement - Looping with GET method

I've built a program to fill up a database and, for now, it's working. Basically, the program makes a request to the app I'm using (via REST API), returns the data I want, and then manipulates it into a form the database accepts.
The problem is that the GET method makes the algorithm too slow, because I'm accessing the details of particular entries, so for each entry I have to make one request. I have something close to 15,000 requests to make, and each row in the database takes about 1 second to create.
Is there any possible way to make these requests faster? How can I improve the performance of this method? And, by the way, any tips for measuring the performance of the code?
Thanks in advance!!
here's the code:
# Retrieving all the IDs I want to get the detailed info for
abc_ids = serializers.serialize('json', modelExample.objects.all(), fields=('id'))
abc_ids = json.loads(abc_ids)
abc_ids_size = len(abc_ids)

# Had to declare these guys right here because at the end of the code I use them
# in the functions that create and update the database, and Python was complaining
# about use before assignment. Picked arbitrary values for them.
age = 0
time_to_won = 0
data = '2016-01-01 00:00:00'

# First loop -> request the detailed info of ABC
for x in range(0, abc_ids_size):
    id = abc_ids[x]['fields']['id']
    resp = requests.get(
        'https://api.example.com/v3/abc/' + str(id) + '?api_token=123123123')
    info = resp.json()
    dealx = dict(info)
    # Second loop -> picking the info I want to update and create in the database
    for key, result in dealx['data'].items():
        # Relevant only for ModelExample -> UPDATE
        if key == 'age':
            result = dict(result)
            age = result['total_seconds']
        # Relevant only for ModelExample -> UPDATE
        elif key == 'average_time_to_won':
            result = dict(result)
            time_to_won = result['total_seconds']
        # Relevant for Model_Example2 -> CREATE
        # Storing a date here to use further on in a datetime manipulation
        if key == 'add_time':
            data = str(result)
        elif key == 'time_stage':
            # Each stage has a total of seconds that the user stayed in it.
            y = result['times_in_stages']
            # The user can be in any stage he wants, there's no rule about the order,
            # but there is a record of the order he chose.
            z = result['order_of_stages']
            # Creating a list to fill up with all stages info and use in the bulk_create.
            data_set = []
            index = 0
            # Setting the number of repetitions based on the number of stages in the list.
            for elemento in range(0, len(z)):
                data_set_i = {}
                # The index is to define the order of the stages.
                index = index + 1
                for key_1, result_1 in y.items():
                    if int(key_1) == z[elemento]:
                        data_set_i['stage_id'] = int(z[elemento])
                        data_set_i['index'] = int(index)
                        data_set_i['abc_id'] = id
                        # Datetime manipulation
                        if result_1 == 0 and index == 1:
                            data_set_i['add_date'] = data
                        # I know that I totally repeated the code here, I was trying to get this part shorter
                        # but I could not get it right.
                        elif result_1 > 0 and index == 1:
                            data_t = datetime.strptime(data, "%Y-%m-%d %H:%M:%S")
                            data_sum = data_t + timedelta(seconds=result_1)
                            data_sum += timedelta(seconds=3)
                            data_nova = str(data_sum.year) + '-' + str(formaters.DateNine(
                                data_sum.month)) + '-' + str(formaters.DateNine(data_sum.day)) + ' ' + str(
                                data_sum.hour) + ':' + str(formaters.DateNine(data_sum.minute)) + ':' + str(
                                formaters.DateNine(data_sum.second))
                            data_set_i['add_date'] = str(data_nova)
                        else:
                            data_t = datetime.strptime(data_set[elemento - 1]['add_date'], "%Y-%m-%d %H:%M:%S")
                            data_sum = data_t + timedelta(seconds=result_1)
                            data_sum += timedelta(seconds=3)
                            data_nova = str(data_sum.year) + '-' + str(formaters.DateNine(
                                data_sum.month)) + '-' + str(formaters.DateNine(data_sum.day)) + ' ' + str(
                                data_sum.hour) + ':' + str(formaters.DateNine(data_sum.minute)) + ':' + str(
                                formaters.DateNine(data_sum.second))
                            data_set_i['add_date'] = str(data_nova)
                        data_set.append(data_set_i)
            Model_Example2_List = [Model_Example2(**vals) for vals in data_set]
            Model_Example2.objects.bulk_create(Model_Example2_List)
    ModelExample.objects.filter(abc_id=id).update(age=age, time_to_won=time_to_won)
If the bottleneck is in your network requests, there isn't much you can do except perhaps use gzip or deflate; with requests,
The gzip and deflate transfer-encodings are automatically decoded for you.
If you want to be doubly sure, you can add the following header to the GET request:
{'Accept-Encoding': 'gzip,deflate'}
The other alternative is to use threading and have many requests operate in parallel, a good option if you have lots of bandwidth and multiple cores.
Lastly, there are lots of different ways to profile Python, including the cProfile + KCachegrind combo.
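As an illustration of the threading suggestion, a minimal sketch that fans the detail requests out over a thread pool while reusing one connection; the URL pattern and token are the placeholders from the question:

import requests
from concurrent.futures import ThreadPoolExecutor

session = requests.Session()  # a Session reuses TCP connections between requests

def fetch(abc_id):
    # placeholder endpoint/token copied from the question
    url = 'https://api.example.com/v3/abc/' + str(abc_id) + '?api_token=123123123'
    return abc_id, session.get(url).json()

ids = [101, 102, 103]  # in the real code: the ids pulled out of abc_ids
with ThreadPoolExecutor(max_workers=10) as executor:
    for abc_id, info in executor.map(fetch, ids):
        # the existing per-entry processing would go here
        print(abc_id, len(info))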

MultiThreading/Optimization Python Requests?

I am trying to optimize this code; as of right now it runs 340 requests in 10 minutes, and I am trying to get 1800 requests done in 30 minutes, since I can run a request every second according to the Amazon API. Can I use multithreading with this code to increase the number of runs?
However, I am currently reading all the data into the main function; should I split it up, and how can I figure out how much each thread should take?
def newhmac():
    return hmac.new(AWS_SECRET_ACCESS_KEY, digestmod=sha256)

def getSignedUrl(params):
    hmac = newhmac()
    action = 'GET'
    server = "webservices.amazon.com"
    path = "/onca/xml"
    params['Version'] = '2013-08-01'
    params['AWSAccessKeyId'] = AWS_ACCESS_KEY_ID
    params['Service'] = 'AWSECommerceService'
    params['Timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    key_values = [(urllib.quote(k), urllib.quote(v)) for k, v in params.items()]
    key_values.sort()
    paramstring = '&'.join(['%s=%s' % (k, v) for k, v in key_values])
    urlstring = "http://" + server + path + "?" + \
        ('&'.join(['%s=%s' % (k, v) for k, v in key_values]))
    hmac.update(action + "\n" + server + "\n" + path + "\n" + paramstring)
    urlstring = urlstring + "&Signature=" + \
        urllib.quote(base64.encodestring(hmac.digest()).strip())
    return urlstring

def readData():
    data = []
    with open("ASIN.csv") as f:
        reader = csv.reader(f)
        for row in reader:
            data.append(row[0])
    return data

def writeData(data):
    with open("data.csv", "a") as f:
        writer = csv.writer(f)
        writer.writerows(data)

def main():
    data = readData()
    filtData = []
    i = 0
    count = 0
    while i < len(data) - 10:
        if count % 4 == 0:
            time.sleep(1)
        asins = ','.join([data[x] for x in range(i, i + 10)])
        params = {'ResponseGroup': 'OfferFull,Offers',
                  'AssociateTag': '4chin-20',
                  'Operation': 'ItemLookup',
                  'IdType': 'ASIN',
                  'ItemId': asins}
        url = getSignedUrl(params)
        resp = requests.get(url)
        responseSoup = BeautifulSoup(resp.text)
        quantity = ['' if product.amount is None else product.amount.text for product in responseSoup.findAll("offersummary")]
        price = ['' if product.lowestnewprice is None else product.lowestnewprice.formattedprice.text for product in responseSoup.findAll("offersummary")]
        prime = ['' if product.iseligibleforprime is None else product.iseligibleforprime.text for product in responseSoup("offer")]
        for zz in zip(asins.split(","), price, quantity, prime):
            print zz
            filtData.append(zz)
        print i, len(filtData)
        i += 10
        count += 1
    writeData(filtData)
    threading.Timer(1.0, main).start()
If you are using Python 3.2+, you can use the concurrent.futures library to make it easy to launch tasks in multiple threads. For example, here I am simulating 10 URL-parsing jobs running in parallel, each of which takes 1 second; run synchronously they would take 10 seconds, but with a thread pool of 10 workers the whole thing takes about 1 second:
import time
from concurrent.futures import ThreadPoolExecutor

def parse_url(url):
    time.sleep(1)
    print(url)
    return "done."

st = time.time()
with ThreadPoolExecutor(max_workers=10) as executor:
    for i in range(10):
        future = executor.submit(parse_url, "http://google.com/%s" % i)
print("total time: %s" % (time.time() - st))
Output:
http://google.com/0
http://google.com/1
http://google.com/2
http://google.com/3
http://google.com/4
http://google.com/5
http://google.com/6
http://google.com/7
http://google.com/8
http://google.com/9
total time: 1.0066466331481934
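To tie this back to the question, a hedged sketch of how the per-batch lookup from main() could be handed to such a pool while staying close to the one-request-per-second limit; fetch_batch is a hypothetical wrapper around the question's getSignedUrl() and requests.get():

import time
import requests
from concurrent.futures import ThreadPoolExecutor

def fetch_batch(asins):
    params = {'ResponseGroup': 'OfferFull,Offers',
              'AssociateTag': '4chin-20',
              'Operation': 'ItemLookup',
              'IdType': 'ASIN',
              'ItemId': asins}
    return asins, requests.get(getSignedUrl(params)).text

data = readData()
batches = [','.join(data[i:i + 10]) for i in range(0, len(data) - 10, 10)]

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = []
    for batch in batches:
        futures.append(executor.submit(fetch_batch, batch))
        time.sleep(1)  # throttle submission to roughly one request per second
    results = [f.result() for f in futures]

The BeautifulSoup parsing and writeData() steps from the question would then run over results.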

Fatal Error (INFADI) Missing Directory - multiprocessing Python ArcGIS script error

I have written a script that uses pool.map to process multiple netCDF files and store the information in a table. Each process runs a function that processes one year. Each year has its own individual file geodatabase, a table within that geodatabase, and an mxd. I also set the default workspace and scratch workspace to that geodatabase. For example, when the function loads the year 1979 it accesses the 1979 geodatabase, the 1979 table within that geodatabase, and the 1979 mxd; 1980 would access the 1980 geodatabase, the 1980 table within that geodatabase, and the 1980 mxd.
If I run 1 process everything works fine. If I try to run 2 or more I get Fatal Error (INFADI) Missing Directory. Right now I'm running 6 processes: 4 crash and the other 2 keep going without a problem.
Here is the code:
# Multiprocessing netCDF data into a table
######################################
import arcpy, calendar, datetime, numpy, multiprocessing, sys, re, timeit, os
from arcpy.sa import *

# Receive a day and year and return the date str in MM/DD/YYYY
def getDate(day, year):
    date = datetime.datetime(year, 1, 1) + datetime.timedelta(day)
    date = date.timetuple()
    date = str(date[1]) + '/' + str(date[2]) + '/' + str(date[0])
    return date
# Main loop
# Receive a year int and process all dates within "good" months
def doCalc(year):
    yearstr = str(year)
    print('Starting doCalc: ' + yearstr)
    ############################################
    ####        CHANGE THIS INPUT           ####
    Species = 'Mallard'
    Regiondb = 'North_America'  # Spaces not allowed in filename to map
    Region = 'Duck Zone'        # Spaces allowed in DB
    regionField = 'ZONE_NAME'
    ############################################
    defaultGDB = "D:\\GIS\\projects\\LCC_WSI_Climate\\year" + yearstr + ".gdb"
    # Setting environment variables
    arcpy.env.workspace = defaultGDB
    arcpy.env.scratchWorkspace = defaultGDB
    arcpy.env.overwriteOutput = True
    # Desired months
    goodmonth = (1, 2, 3, 9, 10, 11, 12)
    # Acquire the necessary extensions and exit if they can't be acquired
    # Spatial extension
    try:
        if arcpy.CheckExtension("Spatial") == "Available":
            arcpy.CheckOutExtension("Spatial")
            print("Acquired Spatial license")
        else:
            sys.exit("No Spatial Analyst license available")
    except:
        sys.exit("No Spatial Analyst license available")
    # GeoStats extension
    try:
        if arcpy.CheckExtension("GeoStats") == "Available":
            arcpy.CheckOutExtension("GeoStats")
            print("Acquired GeoStats license")
        else:
            sys.exit("No GeoStats license available")
    except:
        sys.exit("No GeoStats license available")
    # Try and except statements currently used for debugging, which is why the excepts are not specific.
    try:
        # Select the map document and set up layers. Using a map document because NetCDFRasters aren't
        # playing nice if not "living" in a document.
        print('Starting: ' + yearstr)
        start = timeit.default_timer()
        mxd = arcpy.mapping.MapDocument("D:/GIS/projects/LCC_WSI_Climate/python code/WSI_maps" + yearstr + ".mxd")
        df = arcpy.mapping.ListDataFrames(mxd)[0]
        # Set the table to write to according to the year received
        for table in arcpy.mapping.ListTableViews(mxd):
            if table.name == 'T' + yearstr:
                WSITable = table
        # Set the Clip layer according to the Region specified above
        for dflayer in arcpy.mapping.ListLayers(mxd, "", df):
            if dflayer.name == Region:
                WSIClip = dflayer
            if dflayer.name == 'wsi_Layer':
                WSILayer = dflayer
        # Set the directory where the netCDF files reside
        direct = "D:/GIS/projects/LCC_WSI_Climate/python code/wsi/"
        # Set the netCDF file according to the year received
        inputLayer = direct + 'wsi.' + yearstr + '.nc'
        # If it's 1979 it starts in September.
        if year == 1979:
            startday = 243
        else:
            startday = 0
        # Make sure the wsi_Layer is the correct file.
        arcpy.MakeNetCDFRasterLayer_md(inputLayer, "wsi", "x", "y", "wsi_Layer")
        # Check if the current year is a leap year
        if calendar.isleap(year):
            maxday = 366
        else:
            maxday = 365
        # Cycle through every day within the year
        for daycnt in range(startday, maxday):
            day = 0
            sendday = daycnt + 1
            date = getDate(daycnt, year)
            newdate = datetime.datetime(year, 1, 1) + datetime.timedelta(daycnt)
            newdate = newdate.timetuple()
            month = newdate[1]
            day = newdate[2]
            # If the month is not desired, skip the day and continue with the next day
            if month not in goodmonth:
                continue
            datestr = str(month) + '/' + str(day) + '/' + str(year)
            print(datestr)
            # Use the Select by Dimension tool to change the netCDF layer to the current date
            WSILayerRas = Raster("wsi_Layer")
            arcpy.SelectByDimension_md(WSILayerRas, [["time", date]], "BY_VALUE")
            # Save the file in defaultGDB. Processing didn't work without saving.
            WSILayerRas.save("Temp" + yearstr)
            ##########################################
            ##  Regions
            ##
            wsikm = 0
            datalist = []
            # Calculate time
            time = 'time ' + str(date)
            # Set up the cursor to write to the output table defined above (taken from the mxd).
            cursorout = arcpy.da.InsertCursor(WSITable, ("CATEGORY", "STATE", "SUBCATEGORY", "DATE", "SQKM", "SPECIES"))
            # Set up a search cursor to go through the input dataset and clip the raster to the shape of each feature.
            # Copy data to the output table
            with arcpy.da.SearchCursor(WSIClip, (regionField, "SHAPE@", "STATE_NAME")) as cursorin:
                for row in cursorin:
                    AOIname = row[0]
                    AOIshape = row[1]
                    AOIextent = AOIshape.extent
                    AOIstate = row[2]
                    # Dealing with odd characters and spaces
                    AOIname = re.sub("\s+", "", AOIname)
                    AOIname = AOIname.strip()
                    AOIname = AOIname.replace("'", "")
                    AOIname = AOIname.replace("/", "_")
                    AOIstatea = re.sub("\s+", "", AOIstate)
                    #print('State: ' + AOIstate + ', AOI: ' + AOIname)
                    savetemp = AOIstatea + '_' + AOIname + '_' + yearstr
                    # Process crashes running this try/except. The except doesn't catch it.
                    try:
                        deleteme = Raster(arcpy.gp.ExtractByMask_sa(WSILayerRas, AOIshape))
                    except:
                        continue
                    deleteme.save(savetemp)
                    # Add the raster to an array for deletion later
                    datalist.append(deleteme)
                    # Convert the extracted raster to a NumPy array and extract desired values
                    # by incrementing a counter and calculating area.
                    my_array = arcpy.RasterToNumPyArray(deleteme)
                    rows, cols = my_array.shape
                    countAOI = 0
                    wsikm = 0
                    # time calculation
                    for rowNum in xrange(rows):
                        for colNum in xrange(cols):
                            value = my_array.item(rowNum, colNum)
                            if value >= 7.2:
                                countAOI += 1
                    wsikm = countAOI * 1024
                    # Write to the output table
                    cursorout.insertRow((Region, AOIstate, AOIname, datestr, wsikm, Species))
                    # Clean up the geodatabase
                    ## print('Cleaning up')
                    arcpy.Delete_management(savetemp)
                    datasetList = arcpy.ListDatasets("Extract_W*", "Raster")
                    try:
                        for dataset in datasetList:
                            arcpy.Delete_management(dataset)
                    except:
                        continue
            # Attempts at fixing the error
            deleteme = None
            del cursorout
            del cursorin
        # Finish calculating the time to process one entire year
        stop = timeit.default_timer()
        print stop - start
    except Exception as e:
        #print sys.exc_traceback.tb_lineno
        return e
####
# MAIN
####
if __name__ == '__main__':
    print('Starting script')
    # Start timing the entire process
    start = timeit.default_timer()
    # Year range
    # Entire dataset
    #yearlist = list(range(1979, 2013))
    # Sample
    yearlist = list(range(1979, 1986))
    # Create pool
    print("Creating pool")
    pool = multiprocessing.Pool(7)
    # Call doCalc and pass the year list
    pool.map(doCalc, yearlist)
    ## e = doCalc(1979)
    print("Closing pool")
    pool.close()
    print("Joining pool")
    pool.join()
    #print(e[0])
    stop = timeit.default_timer()
    print stop - start
    print("Complete")
The fix was found and posted http://forums.arcgis.com/threads/109606-Multiprocessing-script-errors-on-geoprocessing-line-of-code-INFADI-(Missing-Dir)?p=387987&posted=1#post387987
The trick is to set os.environ["TEMP"] (as well as TMP) uniquely within each process.
def doCalc(year):
    yearstr = str(year)
    import time
    time.sleep(1.1)
    newTempDir = r"C:\temp\gptmpenvr_" + time.strftime('%Y%m%d%H%M%S') + yearstr
    os.mkdir(newTempDir)
    os.environ["TEMP"] = newTempDir
    os.environ["TMP"] = newTempDir
    print('Starting doCalc: ' + yearstr)
