Data gets mixed up while trying to transfer it to arangodb - python

I'm trying to transfer about 10 GB of JSON data (tweets, in my case) to a collection in ArangoDB, and I'm trying to use joblib for it:
from ArangoConn import ArangoConn
import Userdata as U
import encodings
from joblib import Parallel, delayed
import json
from glob import glob
import time

def progress(total, prog, start, stri=""):
    if prog == 0:
        print("")
        prog = 1
    perc = prog / total
    diff = time.time() - start
    rem = (diff / prog) * (total - prog)
    bar = ""
    for i in range(0, int(perc * 20)):
        bar = bar + "|"
    for i in range(int(perc * 20), 20):
        bar = bar + " "
    print("\r" + "progress: " + "[" + bar + "] " + str(prog) + " of " +
          str(total) + ": {0:.1f}% ".format(perc * 100) + "- " +
          time.strftime("%H:%M:%S", time.gmtime(rem)) + " " + stri, end="")

def processfile(filepath):
    file = open(filepath, encoding='utf-8')
    s = file.read()
    file.close()
    data = json.loads(s)
    Parallel(n_jobs=12, verbose=0, backend="threading")(
        map(delayed(ArangoConn.createDocFromObject), data))

files = glob(U.path + '/*.json')
i = 1
j = len(files)
starttime = time.time()
for f in files:
    progress(j, i, starttime, f)
    i = i + 1
    processfile(f)
and
from pyArango.connection import Connection
import Userdata as U
import time

class ArangoConn:
    def __init__(self, server, user, pw, db, collectionname):
        self.server = server
        self.user = user
        self.pw = pw
        self.db = db
        self.collectionname = collectionname
        self.connection = None
        self.dbHandle = self.connect()
        if not self.dbHandle.hasCollection(name=self.collectionname):
            coll = self.dbHandle.createCollection(name=collectionname)
        else:
            coll = self.dbHandle.collections[collectionname]
        self.collection = coll

    def db_createDocFromObject(self, obj):
        data = obj.__dict__()
        doc = self.collection.createDocument()
        for key, value in data.items():
            doc[key] = value
        doc._key = str(int(round(time.time() * 1000)))
        doc.save()

    def connect(self):
        self.connection = Connection(arangoURL=self.server + ":8529",
                                     username=self.user, password=self.pw)
        if not self.connection.hasDatabase(self.db):
            db = self.connection.createDatabase(name=self.db)
        else:
            db = self.connection.databases.get(self.db)
        return db

    def disconnect(self):
        self.connection.disconnectSession()

    def getAllData(self):
        docs = []
        for doc in self.collection.fetchAll():
            docs.append(self.doc_to_result(doc))
        return docs

    def addData(self, obj):
        self.db_createDocFromObject(obj)

    def search(self, collection, search, prop):
        docs = []
        aql = """FOR q IN """ + collection + """ FILTER q.""" + prop + """ LIKE
            "%""" + search + """%" RETURN q"""
        results = self.dbHandle.AQLQuery(aql, rawResults=False, batchSize=1)
        for doc in results:
            docs.append(self.doc_to_result(doc))
        return docs

    def doc_to_result(self, arangodoc):
        modstore = arangodoc.getStore()
        modstore["_key"] = arangodoc._key
        return modstore

    def db_createDocFromJson(self, json):
        for d in json:
            doc = self.collection.createDocument()
            for key, value in d.items():
                doc[key] = value
            doc._key = str(int(round(time.time() * 1000)))
            doc.save()

    @staticmethod
    def createDocFromObject(obj):
        c = ArangoConn(U.url, U.user, U.pw, U.db, U.collection)
        data = obj
        doc = c.collection.createDocument()
        for key, value in data.items():
            doc[key] = value
        doc._key = doc["id"]
        doc.save()
        c.connection.disconnectSession()
It kind of works like that. My problem is that the data that lands in the database is somehow mixed up.
As you can see in the screenshot, "id" and "id_str" are not the same, although they should be.
What I have investigated so far:
I thought that at some point the default keys in the database might "collide" because of the threading, so I set the key to the tweet id.
I tried it without multiple threads; threading doesn't seem to be the problem.
I looked at the data I send to the database and everything seems to be fine. But as soon as I communicate with the db, the data gets mixed up.
My professor thought that maybe something in pyArango isn't thread-safe and messes up the data, but I don't think so, as threading doesn't seem to be the problem.
I have no ideas left where this behavior could come from.
Any ideas?

The screenshot shows the following values:
id : 892886691937214500
id_str : 892886691937214465
It looks like somewhere along the way the value is converted to an IEEE754 double, which cannot safely represent the latter value. So there is potentially some precision loss due to conversion.
A quick example in node.js (JavaScript is using IEEE754 doubles for any number values greater than 0xffffffff) shows that this is likely the problem cause:
$ node
> 892886691937214500
892886691937214500
> 892886691937214465
892886691937214500
So the question is where the conversion does happen. Can you check whether the python client program is correctly sending the expected values to ArangoDB, or does it already send the converted/truncated values?
In general, any integer number that exceeds 0x7fffffffffffffff will be truncated when stored in ArangoDB, or converted to an IEEE754 double. This can be avoided by storing the number values inside a string, but of course comparing two number strings will produce different results than comparing two numbers (e.g. "10" < "9" vs. 10 > 9).
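To narrow down where the truncation happens, it can help to check what the Python side actually holds right after json.loads, and, if needed, keep the big ids as strings. A minimal sketch (the sample value is the one from the screenshot):
import json

raw = '{"id": 892886691937214465, "id_str": "892886691937214465"}'

data = json.loads(raw)
# Python ints are arbitrary precision, so the parsed value is still exact here
print(data["id"])                 # 892886691937214465

# but any round trip through an IEEE754 double loses the low digits
print(int(float(data["id"])))     # 892886691937214464

# parsing all integers as strings sidesteps the conversion entirely
# (note: this affects every integer field in the document)
safe = json.loads(raw, parse_int=str)
print(safe["id"])                 # '892886691937214465'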

Related

Confused by a Python type error

I've been using Python for a little while and have made some improvements, but this is a new error to me. I'm trying to learn social media analysis for my career, and that's why I am trying out this set of code here.
I've debugged one error, but this one, which appears at line 81, has me stumped: I can't see why the function "def get_user_objects(follower_ids):" returns None, or what I'd need to change in accordance with previous advice on other questions here.
Here's the script up to that point, for simplicity. All help appreciated.
The error, to repeat, is: TypeError: object of type 'NoneType' has no len()
from tweepy import OAuthHandler
from tweepy import API
from collections import Counter
from datetime import datetime, date, time, timedelta
import sys
import json
import os
import io
import re
import time

# Helper functions to load and save intermediate steps
def save_json(variable, filename):
    with io.open(filename, "w", encoding="utf-8") as f:
        f.write(str(json.dumps(variable, indent=4, ensure_ascii=False)))

def load_json(filename):
    ret = None
    if os.path.exists(filename):
        try:
            with io.open(filename, "r", encoding="utf-8") as f:
                ret = json.load(f)
        except:
            pass
    return ret

def try_load_or_process(filename, processor_fn, function_arg):
    load_fn = None
    save_fn = None
    if filename.endswith("json"):
        load_fn = load_json
        save_fn = save_json
    else:
        load_fn = load_bin
        save_fn = save_bin
    if os.path.exists(filename):
        print("Loading " + filename)
        return load_fn(filename)
    else:
        ret = processor_fn(function_arg)
        print("Saving " + filename)
        save_fn(ret, filename)
        return ret

# Some helper functions to convert between different time formats and
# perform date calculations
def twitter_time_to_object(time_string):
    twitter_format = "%a %b %d %H:%M:%S %Y"
    match_expression = "^(.+)\s(\+[0-9][0-9][0-9][0-9])\s([0-9][0-9][0-9][0-9])$"
    match = re.search(match_expression, time_string)
    if match is not None:
        first_bit = match.group(1)
        second_bit = match.group(2)
        last_bit = match.group(3)
        new_string = first_bit + " " + last_bit
        date_object = datetime.strptime(new_string, twitter_format)
        return date_object

def time_object_to_unix(time_object):
    return int(time_object.strftime("%s"))

def twitter_time_to_unix(time_string):
    return time_object_to_unix(twitter_time_to_object(time_string))

def seconds_since_twitter_time(time_string):
    input_time_unix = int(twitter_time_to_unix(time_string))
    current_time_unix = int(get_utc_unix_time())
    return current_time_unix - input_time_unix

def get_utc_unix_time():
    dts = datetime.utcnow()
    return time.mktime(dts.timetuple())

# Get a list of follower ids for the target account
def get_follower_ids(target):
    return auth_api.followers_ids(target)

# Twitter API allows us to batch query 100 accounts at a time
# So we'll create batches of 100 follower ids and gather Twitter User objects for each batch
def get_user_objects(follower_ids):
    batch_len = 100
    num_batches = len(follower_ids)/100
    batches = (follower_ids[i:i+batch_len] for i in range(0, len(follower_ids), batch_len))
    all_data = []
    for batch_count, batch in enumerate(batches):
        sys.stdout.write("\r")
        sys.stdout.flush()
        sys.stdout.write("Fetching batch: " + str(batch_count) + "/" + str(num_batches))
        sys.stdout.flush()
        users_list = auth_api.lookup_users(user_ids=batch)
        users_json = (map(lambda t: t._json, users_list))
        all_data += users_json
    return all_data

# Creates one week length ranges and finds items that fit into those range boundaries
def make_ranges(user_data, num_ranges=20):
    range_max = 604800 * num_ranges
    range_step = range_max/num_ranges
    # We create ranges and labels first and then iterate these when going
    # through the whole list of user data, to speed things up
    ranges = {}
    labels = {}
    for x in range(num_ranges):
        start_range = x * range_step
        end_range = x * range_step + range_step
        label = "%02d" % x + " - " + "%02d" % (x+1) + " weeks"
        labels[label] = []
        ranges[label] = {}
        ranges[label]["start"] = start_range
        ranges[label]["end"] = end_range
    for user in user_data:
        if "created_at" in user:
            account_age = seconds_since_twitter_time(user["created_at"])
            for label, timestamps in ranges.iteritems():
                if account_age > timestamps["start"] and account_age < timestamps["end"]:
                    entry = {}
                    id_str = user["id_str"]
                    entry[id_str] = {}
                    fields = ["screen_name", "name", "created_at",
                              "friends_count", "followers_count",
                              "favourites_count", "statuses_count"]
                    for f in fields:
                        if f in user:
                            entry[id_str][f] = user[f]
                    labels[label].append(entry)
    return labels

if __name__ == "__main__":
    account_list = []
    if (len(sys.argv) > 1):
        account_list = sys.argv[1:]
    if len(account_list) < 1:
        print("No parameters supplied. Exiting.")
        sys.exit(0)
    consumer_key = "XXXXXXX"
    consumer_secret = "XXXXXX"
    access_token = "XXXXXXX"
    access_token_secret = "XXXXXXXX"
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    auth_api = API(auth)
    for target in account_list:
        print("Processing target: " + target)
        # Get a list of Twitter ids for followers of target account and save it
        filename = target + "_follower_ids.json"
        follower_ids = try_load_or_process(filename, get_follower_ids, target)
        # Fetch Twitter User objects from each Twitter id found and save the data
        filename = target + "_followers.json"
        user_objects = try_load_or_process(filename, get_user_objects, follower_ids)
        total_objects = len(user_objects)
        # Record a few details about each account that falls between specified age ranges
        ranges = make_ranges(user_objects)
        filename = target + "_ranges.json"
        save_json(ranges, filename)
        # Print a few summaries
        print
        print("\t\tFollower age ranges")
        print("\t\t===================")
        total = 0
        following_counter = Counter()
        for label, entries in sorted(ranges.iteritems()):
            print("\t\t" + str(len(entries)) + " accounts were created within " + label)
            total += len(entries)
            for entry in entries:
                for id_str, values in entry.iteritems():
                    if "friends_count" in values:
                        following_counter[values["friends_count"]] += 1
        print("\t\tTotal: " + str(total) + "/" + str(total_objects))
        print
        print("\t\tMost common friends counts")
        print("\t\t==========================")
        total = 0
        for num, count in following_counter.most_common(20):
            total += count
            print("\t\t" + str(count) + " accounts are following " + str(num) + " accounts")
        print("\t\tTotal: " + str(total) + "/" + str(total_objects))
        print
        print
The immediate problem is in load_json: you assume its return value is a list or dict, or something that can be passed to len. However, it can return None in a number of circumstances:
The file to read from isn't found
There is some error reading from the file
There is a problem decoding the contents of the file
The file contains just the JSON value null.
At no point after you call load_json do you check its return value.
Worse, you catch and ignore any exception that might occur in load_json, causing it to silently return None with no indication that something went wrong.
The function would be better written like
def load_json(filename):
    with io.open(filename, "r", encoding="utf-8") as f:
        return json.load(f)
At least now, any errors will raise an uncaught exception, making it more obvious that there was a problem and providing a clue as to what the problem was. The golden rule of exception handling is to only catch the exceptions you can do something about, and if you can't do anything about a caught exception, re-raise it.
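If you do need to handle something locally (say, report which file could not be decoded), the same rule can still be followed by re-raising after the handling; a minimal sketch:
def load_json(filename):
    try:
        with io.open(filename, "r", encoding="utf-8") as f:
            return json.load(f)
    except ValueError:
        # report the file we could not decode, then let the error propagate
        print("Could not decode JSON in " + filename)
        raise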
You could check for the resultant value and follow accordingly:
# Fetch Twitter User objects from each Twitter id found and save the data
filename = target + "_followers.json"
res_get_user_objects = get_user_objects(follower_ids)
if res_get_user_objects is not None:
    user_objects = try_load_or_process(filename, get_user_objects, follower_ids)
    total_objects = len(user_objects)
else:
    pass  # handle it otherwise
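Since the None in the traceback is most likely follower_ids itself (returned by try_load_or_process via load_json), guarding that value before batching is the more direct check; a small sketch under that assumption:
filename = target + "_follower_ids.json"
follower_ids = try_load_or_process(filename, get_follower_ids, target)
if follower_ids is None:
    print("Could not load or fetch follower ids for " + target + ", skipping.")
    continue
filename = target + "_followers.json"
user_objects = try_load_or_process(filename, get_user_objects, follower_ids)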

Python multicore CSV short program, advice/help needed

I'm a hobby coder who started with AHK, then some Java, and now I'm trying to learn Python. I have searched and found some tips, but I have not yet been able to implement them in my own code.
Hopefully someone here can help me; it's a very short program.
I'm using a .txt CSV database with ";" as the separator.
DATABASE EXAMPLE:
Which color is normally a cat?;Black
How tall was the longest man on earth?;272 cm
Is the earth round?;Yes
The database now consists of 20,000 lines, which makes the program "too slow", using only 25% CPU (1 core).
If I can make it use all 4 cores (100%), I guess it would perform the task a lot faster. The task is basically to compare the CLIPBOARD with the database and, if there is a match, give me back the answer. Perhaps I could also separate the database into 4 pieces?
The code right now looks like this! Not more than 65 lines, and it's doing its job (but too slowly). Advice on how I can make this process multi-core is needed.
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy

ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'

def load_db():
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            db = db.drop_duplicates()
            return db
        except:
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)

def top_answers(db, question):
    db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
    db_sorted = db.sort_values(by='ratio', ascending=False)
    db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
    return db_sorted

def write_txt(top):
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar.txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")

def main():
    try:
        db = load_db()
        last_db_reload = time.time()
        while True:
            # Get contents of clipboard
            question = pp.paste()
            # Rank answers
            top = top_answers(db, question)
            # If an answer was found, show results
            if len(top) > 0:
                write_txt(top)
            time.sleep(fall_back_time)
    except:
        print("Error in main(). Will sleep for %i seconds..." % fall_back_time)
        time.sleep(fall_back_time)

if __name__ == '__main__':
    main()
If you could divide the db into four equally large parts, you could process them in parallel like this:
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
import threading

ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'

def worker(thread_id, question):
    thread_id = str(thread_id)
    db = pd.read_csv(db_file_path + thread_id, sep=db_separator, encoding=db_encoding)
    db = db.drop_duplicates()
    db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
    db_sorted = db.sort_values(by='ratio', ascending=False)
    db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
    top = db_sorted
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar" + thread_id + ".txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")
    return

def main():
    question = pp.paste()
    threads = []
    for i in range(1, 5):  # four chunks: database.txt1 .. database.txt4
        t = threading.Thread(target=worker, args=(i, question))
        t.start()
        threads.append(t)
    # join only after all threads have been started, otherwise they run one after another
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()
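The worker above reads its chunk from database.txt1 … database.txt4 (db_file_path + thread_id), so the original file has to be split once beforehand. A small one-off sketch that could produce those files (the chunk filenames are just an assumption chosen to match the worker):
import numpy as np
import pandas as pd

db = pd.read_csv('database.txt', sep=';', encoding='latin-1')
# four roughly equal chunks, written with the same separator and encoding as the main program
for i, chunk in enumerate(np.array_split(db, 4), start=1):
    chunk.to_csv('database.txt' + str(i), sep=';', encoding='latin-1', index=False)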
The solution with multiprocessing:
import time
import pyperclip as pp
import pandas as pd
#import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy as np
# pathos uses a better pickle to transfer more complicated objects
from pathos.multiprocessing import Pool
from functools import reduce
import sys
import os
from contextlib import closing

ratio_threshold = 70
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
chunked_db = []
NUM_PROCESSES = os.cpu_count()

def load_db():
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            db.columns = ['question', 'answer']
            #db = db.drop_duplicates()  # I drop it for the experiment
            break
        except:
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)
    # split database into equal chunks:
    # (if you have a lot of RAM, otherwise you
    # need to compute ranges in db, something like
    # chunk_size = len(db)//NUM_PROCESSES
    # ranges[i] = (i*chunk_size, (i+1)*chunk_size)
    # and pass ranges in the original db to the processes)
    chunked_db = np.array_split(db, NUM_PROCESSES)
    return chunked_db

def top_answers_multiprocessed(question, chunked_db):
    # on unix, python uses 'fork' mode by default
    # so the process has 'copy-on-change' access to all global variables
    # i.e. if a process changes something in db, it will be copied to it
    # with a lot of overhead
    # Unfortunately, I've heard that on Windows only 'spawn' mode with a full
    # copy of everything is used
    # The process pipeline uses pickle, which is quite slow,
    # so on a small database you may not benefit from multiprocessing
    # If you are going to transfer big objects in or out, look
    # in the direction of multiprocessing.Array
    # this solution is not fully efficient,
    # as the pool is recreated each time
    # You can create daemon processes which will monitor
    # a Queue for incoming questions, but it's harder to implement
    def top_answers(idx):
        # question is in the scope of the parent function
        chunked_db[idx]['ratio'] = chunked_db[idx]['question'].apply(lambda q: fuzz.ratio(q, question))
        db_sorted = chunked_db[idx].sort_values(by='ratio', ascending=False)
        db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
        return db_sorted

    with closing(Pool(processes=NUM_PROCESSES)) as pool:
        # chunked_db is a list of databases
        # they are in global scope, we send only an index because
        # otherwise the whole data set would be pickled
        num_chunks = len(chunked_db)
        # apply function top_answers across generator range(num_chunks)
        res = pool.imap_unordered(top_answers, range(num_chunks))
        res = list(res)
        # now res is a list of dataframes, let's join it
        res_final = reduce(lambda left, right: pd.merge(left, right, on='ratio'), res)
        return res_final

def write_txt(top):
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar.txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")

def mainfunc():
    global chunked_db
    chunked_db = load_db()
    last_db_reload = time.time()
    print('db loaded')
    last_clip = ""
    while True:
        # Get contents of clipboard
        try:
            new_clip = pp.paste()
        except:
            continue
        if (new_clip != last_clip) and (len(new_clip) > 0):
            print(new_clip)
            last_clip = new_clip
            question = new_clip.strip()
        else:
            continue
        # Rank answers
        top = top_answers_multiprocessed(question, chunked_db)
        # If an answer was found, show results
        if len(top) > 0:
            #write_txt(top)
            print(top)

if __name__ == '__main__':
    mainfunc()

Manage Python Multiprocessing with MongoDB

I'm trying to run my code with a multiprocessing function, but mongo keeps returning
"MongoClient opened before fork. Create MongoClient with
connect=False, or create client after forking."
I really don't understand how I can adapt my code to this.
Basically the structure is:
db = MongoClient().database
db.authenticate('user', 'password', mechanism='SCRAM-SHA-1')
collectionW = db['words']
collectionT = db['sinMemo']
collectionL = db['sinLogic']

def findW(word):
    rows = collectionW.find({"word": word})
    ind = 0
    for row in rows:
        ind += 1
        id = row["_id"]
    if ind == 0:
        a = ind
    else:
        a = id
    return a

def trainAI(stri):
    ...
    if findW(word) == 0:
        _id = db['words'].insert(
            {"_id": getNextSequence(db.counters, "nodeid"), "word": word})
        story = _id
    else:
        story = findW(word)
    ...

def train(index):
    # searching progress
    progFile = "./train/progress{0}.txt".format(index)
    trainFile = "./train/small_file_{0}".format(index)
    if os.path.exists(progFile):
        f = open(progFile, "r")
        ind = f.read().strip()
        if ind != "":
            pprint(ind)
            i = int(ind)
        else:
            pprint("No progress saved or progress lost!")
            i = 0
        f.close()
    else:
        i = 0
    # get the number of lines in the file
    rangeC = rawbigcount(trainFile)
    # fix unicode
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    files = io.open(trainFile, "r", encoding="utf8")
    str1 = ""
    str2 = ""
    filex = open(progFile, "w")
    with progressbar.ProgressBar(max_value=rangeC) as bar:
        for line in files:
            line = line.replace("\n", "")
            if i % 2 == 0:
                str1 = line.translate(non_bmp_map)
            else:
                str2 = line.translate(non_bmp_map)
                bar.update(i)
                trainAI(str1 + " " + str2)
                filex.seek(0)
                filex.truncate()
                filex.write(str(i))
            i += 1

# multiprocessing function
maxProcess = 3
def f(l, i):
    l.acquire()
    train(i + 1)
    l.release()

if __name__ == '__main__':
    lock = Lock()
    for num in range(maxProcess):
        pprint("start " + str(num))
        Process(target=f, args=(lock, num)).start()
This code is made for reading 4 different files in 4 different processes and inserting the data into the database at the same time.
I copied only part of the code, to make the structure of it clear.
I've tried to add connect=False to this code, but nothing changed...
db = MongoClient(connect=False).database
db.authenticate('user', 'password', mechanism='SCRAM-SHA-1')
collectionW = db['words']
collectionT = db['sinMemo']
collectionL = db['sinLogic']
Then I've tried to move it into the f function (right before train()), but then the program doesn't find collectionW, collectionT and collectionL.
I'm not very experienced with Python or MongoDB, so I hope that this is not a silly question.
The code is running under Ubuntu 16.04.2 with Python 2.7.12.
db.authenticate will have to connect to the mongo server, and it will try to make a connection. So, even though connect=False is being used, db.authenticate will require a connection to be open.
Why don't you create the mongo client instance after the fork? That looks like the easiest solution.
Since db.authenticate must open the MongoClient and connect to the server, it creates connections which won't work in the forked subprocess. Hence, the error message. Try this instead:
db = MongoClient('mongodb://user:password@localhost', connect=False).database
Also, consider deleting the Lock l: each worker holds it for the entire train() call, so the subprocesses end up running one at a time and you lose the benefit of using multiple processes.
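A minimal sketch of that approach, adapted to the structure in the question (connection details and the collection usage are placeholders, not the real code):
from multiprocessing import Process
from pymongo import MongoClient

def train(index):
    # create the client inside the child process, i.e. after the fork
    client = MongoClient('localhost', 27017)
    db = client.database
    db.authenticate('user', 'password', mechanism='SCRAM-SHA-1')
    collectionW = db['words']
    # ... read the train file for this index and insert into collectionW as before ...

if __name__ == '__main__':
    for num in range(3):
        Process(target=train, args=(num + 1,)).start()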
Here is how I did it for my problem:
import pathos.pools as pp
import time
import db_access

class MultiprocessingTest(object):
    def __init__(self):
        pass

    def test_mp(self):
        data = [[form, 'form_number', 'client_id'] for form in range(5000)]
        pool = pp.ProcessPool(4)
        pool.map(db_access.insertData, data)

if __name__ == '__main__':
    time_i = time.time()
    mp = MultiprocessingTest()
    mp.test_mp()
    time_f = time.time()
    print 'Time Taken: ', time_f - time_i
Here is db_access.py:
from pymongo import MongoClient

def insertData(form):
    client = MongoClient()
    db = client['TEST_001']
    db.initialization.insert({
        "form": form[0],
        "form_number": form[1],
        "client_id": form[2]
    })
This is happening to your code because you are initiating MongoClient() once for all the sub-processes. MongoClient is not fork-safe. So initiating it inside each function works; let me know if there are other solutions.

Calculate each Dropbox folder size recursively using the Python API

EDIT: I want to calculate each folder's size, not just the entire Dropbox size... My code is working fine for the whole Dropbox size.
I am having difficulty calculating the size of each Dropbox folder using the Python API,
as Dropbox returns the folder size as zero.
Here's my code so far, but it's giving me the wrong answer:
def main(dp_path):
    a = client.metadata(dp_path)
    size_local = 0
    for x in a['contents']:
        if x['is_dir'] == False:
            global size
            size += int(x['bytes'])
            size_local += int(x['bytes'])
            #print "Total size so far :"+str(size/(1024.00*1024.00))+" Mb..."
        if x['is_dir'] == True:
            a = main(str(x['path']))
            print str(x['path'])+" size=="+str(a/(1024.00*1024.00))+" Mb..."
    return size_local + size

if __name__ == '__main__':
    global size
    size = 0
    main('/')
    print str(size/(1024.00*1024.00))+" Mb"
EDIT 2: It seems I misunderstood the question. Here's code that prints out the sizes of each folder (in order of decreasing size):
from dropbox.client import DropboxClient
from collections import defaultdict

client = DropboxClient('<YOUR ACCESS TOKEN>')

sizes = {}
cursor = None
while cursor is None or result['has_more']:
    result = client.delta(cursor)
    for path, metadata in result['entries']:
        sizes[path] = metadata['bytes'] if metadata else 0
    cursor = result['cursor']

foldersizes = defaultdict(lambda: 0)
for path, size in sizes.items():
    segments = path.split('/')
    for i in range(1, len(segments)):
        folder = '/'.join(segments[:i])
        if folder == '': folder = '/'
        foldersizes[folder] += size

for folder in reversed(sorted(foldersizes.keys(), key=lambda x: foldersizes[x])):
    print '%s: %d' % (folder, foldersizes[folder])
EDIT: I had a major bug in the second code snippet (the delta one), and I've now tested all three and found them all to report the same number.
This works:
from dropbox.client import DropboxClient
client = DropboxClient('<YOUR ACCESS TOKEN>')

def size(path):
    return sum(
        f['bytes'] if not f['is_dir'] else size(f['path'])
        for f in client.metadata(path)['contents']
    )

print size('/')
But it's much more efficient to use /delta:
sizes = {}
cursor = None
while cursor is None or result['has_more']:
    result = client.delta(cursor)
    for path, metadata in result['entries']:
        sizes[path] = metadata['bytes'] if metadata else 0
    cursor = result['cursor']

print sum(sizes.values())
And if you truly just need to know the overall usage for the account, you can just do this:
quota_info = client.account_info()['quota_info']
print quota_info['normal'] + quota_info['shared']

Converting an UNIX python program to work in windows [closed]

I need to make a program that drives a DYMO LabelManager PnP label printing device. DYMO provides an SDK for this purpose, but after some desperate trying, I'd say the SDK is useless. Then I found a program which is just what I need, written by a guy named S. Bronner. But the problem is that his program is made for Python on UNIX, and I would need it to work on Windows with Python. So I'm asking: is there anyone who could examine this code and convert it to work on Windows for me? My Python skills are not good enough to accomplish this. Here is the code which should be converted:
#!/usr/bin/env python

DEV_CLASS = 3
DEV_VENDOR = 0x0922
DEV_PRODUCT = 0x1001
DEV_NODE = None
DEV_NAME = 'Dymo LabelManager PnP'
FONT_FILENAME = '/usr/share/fonts/truetype/ttf-bitstream-vera/Vera.ttf'
FONT_SIZERATIO = 7./8

import Image
import ImageDraw
import ImageFont
import array
import fcntl
import os
import re
import struct
import subprocess
import sys
import termios
import textwrap

class DymoLabeler:
    """
    Create and work with a Dymo LabelManager PnP object.

    This class contains both mid-level and high-level functions. In general,
    the high-level functions should be used. However, special purpose usage
    may require the mid-level functions. That is why they are provided.
    However, they should be well understood before use. Look at the
    high-level functions for help. Each function is marked in its docstring
    with 'HLF' or 'MLF' in parentheses.
    """

    def __init__(self, dev):
        """Initialize the LabelManager object. (HLF)"""
        self.maxBytesPerLine = 8  # 64 pixels on a 12mm-tape
        self.ESC = 0x1b
        self.SYN = 0x16
        self.cmd = []
        self.rsp = False
        self.bpl = None
        self.dtb = 0
        if not os.access(dev, os.R_OK | os.W_OK): return False
        self.dev = open(dev, 'r+')

    def sendCommand(self):
        """Send the already built command to the LabelManager. (MLF)"""
        if len(self.cmd) == 0: return
        cmdBin = array.array('B', self.cmd)
        cmdBin.tofile(self.dev)
        self.cmd = []
        if not self.rsp: return
        self.rsp = False
        rspBin = self.dev.read(8)
        rsp = array.array('B', rspBin).tolist()
        return rsp

    def resetCommand(self):
        """Remove a partially built command. (MLF)"""
        self.cmd = []
        self.rsp = False

    def buildCommand(self, cmd):
        """Add the next instruction to the command. (MLF)"""
        self.cmd += cmd

    def statusRequest(self):
        """Set instruction to get the device's status. (MLF)"""
        cmd = [self.ESC, ord('A')]
        self.buildCommand(cmd)
        self.rsp = True

    def dotTab(self, value):
        """Set the bias text height, in bytes. (MLF)"""
        if value < 0 or value > self.maxBytesPerLine: raise ValueError
        cmd = [self.ESC, ord('B'), value]
        self.buildCommand(cmd)
        self.dtb = value
        self.bpl = None

    def tapeColor(self, value):
        """Set the tape color. (MLF)"""
        if value < 0: raise ValueError
        cmd = [self.ESC, ord('C'), value]
        self.buildCommand(cmd)

    def bytesPerLine(self, value):
        """Set the number of bytes sent in the following lines. (MLF)"""
        if value < 0 or value + self.dtb > self.maxBytesPerLine: raise ValueError
        if value == self.bpl: return
        cmd = [self.ESC, ord('D'), value]
        self.buildCommand(cmd)
        self.bpl = value

    def cut(self):
        """Set instruction to trigger cutting of the tape. (MLF)"""
        cmd = [self.ESC, ord('E')]
        self.buildCommand(cmd)

    def line(self, value):
        """Set next printed line. (MLF)"""
        self.bytesPerLine(len(value))
        cmd = [self.SYN] + value
        self.buildCommand(cmd)

    def chainMark(self):
        """Set Chain Mark. (MLF)"""
        self.dotTab(0)
        self.bytesPerLine(self.maxBytesPerLine)
        self.line([0x99] * self.maxBytesPerLine)

    def skipLines(self, value):
        """Set number of lines of white to print. (MLF)"""
        if value <= 0: raise ValueError
        self.bytesPerLine(0)
        cmd = [self.SYN] * value
        self.buildCommand(cmd)

    def initLabel(self):
        """Set the label initialization sequence. (MLF)"""
        cmd = [0x00] * 8
        self.buildCommand(cmd)

    def getStatus(self):
        """Ask for and return the device's status. (HLF)"""
        self.statusRequest()
        rsp = self.sendCommand()
        print rsp

    def printLabel(self, lines, dotTab):
        """Print the label described by lines. (HLF)"""
        self.initLabel()
        self.tapeColor(0)
        self.dotTab(dotTab)
        for line in lines:
            self.line(line)
        self.skipLines(56)  # advance printed matter past cutter
        self.skipLines(56)  # add symmetric margin
        self.statusRequest()
        rsp = self.sendCommand()
        print rsp

def die(message=None):
    if message: print >> sys.stderr, message
    sys.exit(1)

def pprint(par, fd=sys.stdout):
    rows, columns = struct.unpack('HH', fcntl.ioctl(sys.stderr, termios.TIOCGWINSZ, struct.pack('HH', 0, 0)))
    print >> fd, textwrap.fill(par, columns)

def getDeviceFile(classID, vendorID, productID):
    # find file containing the device's major and minor numbers
    searchdir = '/sys/bus/hid/devices'
    pattern = '^%04d:%04X:%04X.[0-9A-F]{4}$' % (classID, vendorID, productID)
    deviceCandidates = os.listdir(searchdir)
    foundpath = None
    for devname in deviceCandidates:
        if re.match(pattern, devname):
            foundpath = os.path.join(searchdir, devname)
            break
    if not foundpath: return
    searchdir = os.path.join(foundpath, 'hidraw')
    devname = os.listdir(searchdir)[0]
    foundpath = os.path.join(searchdir, devname)
    filepath = os.path.join(foundpath, 'dev')
    # get the major and minor numbers
    f = open(filepath, 'r')
    devnums = [int(n) for n in f.readline().strip().split(':')]
    f.close()
    devnum = os.makedev(devnums[0], devnums[1])
    # check if a symlink with the major and minor numbers is available
    filepath = '/dev/char/%d:%d' % (devnums[0], devnums[1])
    if os.path.exists(filepath):
        return os.path.realpath(filepath)
    # check if the relevant sysfs path component matches a file name in
    # /dev, that has the proper major and minor numbers
    filepath = os.path.join('/dev', devname)
    if os.stat(filepath).st_rdev == devnum:
        return filepath
    # search for a device file with the proper major and minor numbers
    for dirpath, dirnames, filenames in os.walk('/dev'):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            if os.stat(filepath).st_rdev == devnum:
                return filepath

def access_error(dev):
    pprint('You do not have sufficient access to the device file %s:' % dev, sys.stderr)
    subprocess.call(['ls', '-l', dev], stdout=sys.stderr)
    print >> sys.stderr
    pprint('You probably want to add a rule in /etc/udev/rules.d along the following lines:', sys.stderr)
    print >> sys.stderr, ' SUBSYSTEM=="hidraw", \\'
    print >> sys.stderr, ' ACTION=="add", \\'
    print >> sys.stderr, ' DEVPATH=="/devices/pci[0-9]*/usb[0-9]*/0003:0922:1001.*/hidraw/hidraw0", \\'
    print >> sys.stderr, ' GROUP="plugdev"'
    print >> sys.stderr
    pprint('Following that, turn off your device and back on again to activate the new permissions.', sys.stderr)

# get device file name
if not DEV_NODE:
    dev = getDeviceFile(DEV_CLASS, DEV_VENDOR, DEV_PRODUCT)
else:
    dev = DEV_NODE
if not dev: die("The device '%s' could not be found on this system." % DEV_NAME)

# create dymo labeler object
lm = DymoLabeler(dev)
if not lm: die(access_error(dev))

# check for any text specified on the command line
labeltext = [arg.decode(sys.stdin.encoding) for arg in sys.argv[1:]]
if len(labeltext) == 0: die("No label text was specified.")

# create an empty label image
labelheight = lm.maxBytesPerLine * 8
lineheight = float(labelheight) / len(labeltext)
fontsize = int(round(lineheight * FONT_SIZERATIO))
font = ImageFont.truetype(FONT_FILENAME, fontsize)
labelwidth = max(font.getsize(line)[0] for line in labeltext)
labelbitmap = Image.new('1', (labelwidth, labelheight))

# write the text into the empty image
labeldraw = ImageDraw.Draw(labelbitmap)
for i, line in enumerate(labeltext):
    lineposition = int(round(i * lineheight))
    labeldraw.text((0, lineposition), line, font=font, fill=255)
del labeldraw

# convert the image to the proper matrix for the dymo labeler object
labelrotated = labelbitmap.transpose(Image.ROTATE_270)
labelstream = labelrotated.tostring()
labelstreamrowlength = labelheight/8 + (1 if labelheight%8 != 0 else 0)
if len(labelstream)/labelstreamrowlength != labelwidth: die('An internal problem was encountered while processing the label bitmap!')
labelrows = [labelstream[i:i+labelstreamrowlength] for i in range(0, len(labelstream), labelstreamrowlength)]
labelmatrix = [array.array('B', labelrow).tolist() for labelrow in labelrows]

# optimize the matrix for the dymo label printer
dottab = 0
while max(line[0] for line in labelmatrix) == 0:
    labelmatrix = [line[1:] for line in labelmatrix]
    dottab += 1
for line in labelmatrix:
    while len(line) > 0 and line[-1] == 0:
        del line[-1]

# print the label
lm.printLabel(labelmatrix, dottab)
FONT_FILENAME = '/usr/share/fonts/truetype/ttf-bitstream-vera/Vera.ttf'
won't work because of filesystem differences; it should be changed to the path of a font on your system.
searchdir = '/sys/bus/hid/devices'
won't work either; you have to get the devices in a different way (take a look at the "pywinusb" library?). I'm not sure from where, though. The same problem applies to
filepath = '/dev/char/%d:%d' % (devnums[0], devnums[1])
which isn't accessible on Windows, so you have to do it in a different way.
Besides that, everything else looks OS-independent. If you get any errors after fixing the previous 3 problems, please edit them into your question.
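For the device-lookup part, a rough sketch of what locating the printer could look like with pywinusb (the vendor/product ids are taken from the script; this only finds the HID device, while replacing the hidraw file I/O for sending the byte stream on Windows is not covered here):
from pywinusb import hid

# vendor/product ids from the original script
DEV_VENDOR = 0x0922
DEV_PRODUCT = 0x1001

def find_labelmanager():
    # enumerate HID devices matching the Dymo vendor/product id
    devices = hid.HidDeviceFilter(vendor_id=DEV_VENDOR,
                                  product_id=DEV_PRODUCT).get_devices()
    if not devices:
        raise RuntimeError("Dymo LabelManager PnP not found")
    return devices[0]

if __name__ == '__main__':
    device = find_labelmanager()
    print device.product_name  # the rest of the protocol handling still has to be ported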
