pymongo- upsert not able to perform insertion with $set operation - python

I am having an empty collection and have thousands of entries to process (entries might have redudancy for which I want to use both updates and inserts).
The python code (using pymongo) I wrote:
for mydoc in alldocs:
key = {'myid': mydoc['myid']}
data = process_doc(mydoc) # returns simple dictionary
db.mydocs.update(key, {"$set": data}, upsert = True)
The following code is unable to perform any insert operations. The collection still remains empty. But when I remove $set and use simply data, it works fine. Can't I use $set in upsert? The reason why I want $set was so that pre-existing fields for a BSON doesn't get affected. Can someone please guide. I really can't figure out what to do.
Reproducable code:
from pymongo import Connection
DB_CONTENT_BASE_KEY = 'contentbase'
def connect_to_db(dbname, hostname = 'localhost', portno = 27017, **kwargs):
connection = Connection(hostname, portno)
dbConnection = connection[dbname]
return dbConnection
class MetawebCustomCollectionBuilder(object):
# key ought to be a dictionary to filter results from contentbase.
def __init__(self, inDbConfig, outDbConfig, key = {}, verbose = False):
self.verbose = verbose
self.inDbConfig = inDbConfig
self.inDb = connect_to_db(**inDbConfig)
self.outDbConfig = outDbConfig
self.outDb = connect_to_db(**outDbConfig)
self.inDbContentBase = self.inDb[self.inDbConfig[DB_CONTENT_BASE_KEY]]
self.outDbContentBase = self.outDb[self.outDbConfig[DB_CONTENT_BASE_KEY]]
self.key = key
self.in_db_collection_constraints()
self.out_db_collection_constraints()
def in_db_collection_constraints(self):
self.inDbContentBase.ensure_index('mid')
if self.verbose: print("Assured index on mid for inDbContentBase...")
def out_db_collection_constraints(self):
self.outDbContentBase.ensure_index('mid')
if self.verbose: print("Assured index on mid for outDbContentBase...")
def process_in_record(self, inRecord):
outRecord = inRecord # [YET TO] continue from here...
return outRecord
def transit_collection(self):
for record in self.inDbContentBase.find(self.key):
outRecord = self.process_in_record(record)
key = {'mid':outRecord['mid']}
data = outRecord
print key
self.outDbContentBase.update(key, {"$set": data}, True)
if self.verbose: print 'Done with transiting collection from in DB to out DB'
def cleanup_out_collection(self):
pass
def in_db_sandbox(self):
# To have tests and analytics placed in here corresponding to inDb.
pass
if __name__ == '__main__':
inDbConfig = {'dbname':'metaweb', 'contentbase': 'content'}
outDbConfig = {'dbname': 'similarkind', 'contentbase': 'content'}
mccb = MetawebCustomCollectionBuilder(inDbConfig, outDbConfig, verbose = True)
mccb.transit_collection()
There must be a prexisting database inDb. From this collection I want to create a new modified collection.

Your claim is wrong
>>> import pymongo
>>> c = pymongo.Connection()
>>> db = c.mydb
>>> db.mydocs.find().count()
0
>>> db.mydocs.update({'myid': '438'}, {"$set": {'keyA':'valueA'}}, upsert = True)
>>> db.mydocs.find().count()
1
>>> db.mydocs.find_one()
{u'myid': u'438', u'keyA': u'valueA', u'_id': ObjectId('504c2fd1a694cc9624bbd6a2')}

Related

Mock not overriding the return of a function in Python

I am implementing unit test on one of the classes of my project. The method that I want to test is queryCfsNoteVariations:
class PdfRaportDaoImpl:
def queryCfsNoteVariations(self, reportId):
sql = """
select v.* from item_value_table v
where v.table_id in
(select table_id from table_table t
where t.report_id=%s and table_name='CFS')
"""
cfsItemList = dbFind(sql, (reportId))
sql = "select * from variations_cfs_note"
cfsNoteVariations = dbFind(sql)
if cfsNoteVariations == None or len(cfsNoteVariations) == 0:
raise Exception("cfs note variations is null!")
cfsNoteVariationList = []
for itemInfo in cfsItemList:
for cfsNoteVariation in cfsNoteVariations:
if (
cfsNoteVariation["item_name_cfs"].lower()
== itemInfo["item_name"].lower()
):
cfsNoteVariationList.append(cfsNoteVariation["item_name_cfs_note"])
if len(cfsNoteVariationList) > 0:
return cfsNoteVariationList, itemInfo["item_note"]
return None, None
Which has a path: /com/pdfgather/PDFReportDao.py
In my test I am doing patch on dbFind() method which is located in /com/pdfgather/GlobalHelper.py. My current test looks like this:
from com.pdfgather.PDFReportDao import PdfReportDaoImpl
#patch("com.pdfgather.GlobalHelper.dbFind")
def test_query_cfs_note_variations(self, mock_find):
mock_find.side_effect = iter([
[{"item_name" : "foo"}, {"item_name" : "hey"}],
[{"item_name_cfs": "foo"},
{"item_name_cfs": "foo"},
{"item_name_cfs": "hey"}]]
])
report_id = 3578
result = TestingDao.dao.queryCfsNoteVariations(report_id)
# Printing result
print(result)
However I am not getting my desired result which is getting inside a loop and returning from inside a loop. Instead the dbFind is returning nothing (but it shouldn't as I already preassigned returning values for dbFind).
Thanks in advance!
Python refers com.pdfgather.PDFReportDao.dbFind and com.pdfgather.GlobalHelper.dbFind as two different classes. The second one is the import you want to patch. Try changing your patch to:
#patch("com.pdfgather.PDFReportDao.dbFind")

python cassandra get big result of select * in generator (without storage result in ram)

I want to get all data in cassandra table "user"
i have 840000 users and i don't want to get all users in python list.
i want get users in packs of 100 users
in cassandra doc https://datastax.github.io/python-driver/query_paging.html
i see i can use fetch_size, but in my python code i have database object that contains all cql instruction
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement
class Database:
def __init__(self, name, salary):
self.cluster = Cluster(['192.168.1.1', '192.168.1.2'])
self.session = cluster.connect()
def get_users(self):
users_list = []
query = "SELECT * FROM users"
statement = SimpleStatement(query, fetch_size=10)
for user_row in session.execute(statement):
users_list.append(user_row.name)
return users_list
actually get_users return very big list of user name
but i want to transform return get_users to a "generator"
i don't want get all users name in 1 list and 1 call of function get_users, but i want to have lot of call get_users and return list with only 100 users max every call function
for example :
list1 = database.get_users()
list2 = database.get_users()
...
listn = database.get_users()
list1 contains 100 first user in query
list2 contains 100 "second" users in query
listn contains the latest elements in query (<=100)
is this possible ?
thanks for advance for your answer
According to Paging Large Queries:
Whenever there are no more rows in the current page, the next page
will be fetched transparently.
So, if you execute your code like this, you will still the whole result set, but this is paged in a transparent manner.
In order to achieve what you need to use callbacks. You can also find some code sample on the link above.
I added below the full code for reference.
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement
from threading import Event
class PagedResultHandler(object):
def __init__(self, future):
self.error = None
self.finished_event = Event()
self.future = future
self.future.add_callbacks(
callback=self.handle_page,
errback=self.handle_error)
def handle_page(self, rows):
for row in rows:
process_row(row)
if self.future.has_more_pages:
self.future.start_fetching_next_page()
else:
self.finished_event.set()
def handle_error(self, exc):
self.error = exc
self.finished_event.set()
def process_row(user_row):
print user_row.name, user_row.age, user_row.email
cluster = Cluster()
session = cluster.connect()
query = "SELECT * FROM myschema.users"
statement = SimpleStatement(query, fetch_size=5)
future = session.execute_async(statement)
handler = PagedResultHandler(future)
handler.finished_event.wait()
if handler.error:
raise handler.error
cluster.shutdown()
Moving to next page is done in handle_page when start_fetching_next_page is called.
If you replace the if statement with self.finished_event.set() you will see that the iteration stops after the first 5 rows as defined in fetch_size

SparkStreaming: How to get list like collect()

I am beginner of SparkStreaming.
I want to load HBase record at SparkStreaming App.
So, I write the the under code by python.
My "load_records" function is getting HBase Records and return the records.
SparkStreaming can not use collect(). sc.newAPIHadoopRDD() need to be used at Driver Program. But SparkStreaming do not have the method which can get objects from workers to driver.
How to get HBase Record at SparkStreaming? or How to call sc.newAPIHadoopRDD()?
def load_records(sc, table, keys):
host = 'localhost'
keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
rdd_list = []
for key in keys:
if table == "user":
conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": "user",
"hbase.mapreduce.scan.columns": "u:uid",
"hbase.mapreduce.scan.row.start": key, "hbase.mapreduce.scan.row.stop": key + "\x00"}
rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
"org.apache.hadoop.hbase.io.ImmutableBytesWritable",
"org.apache.hadoop.hbase.client.Result",
keyConverter=keyConv, valueConverter=valueConv, conf=conf)
rdd_list.append(rdd)
first_rdd = rdd_list.pop(0)
for rdd in rdd_list:
first_rdd = first_rdd.union(rdd)
return first_rdd
sc = SparkContext(appName="UserStreaming")
ssc = StreamingContext(sc, 3)
topics = ["json"]
broker_list = "localhost:9092"
inputs = KafkaUtils.createDirectStream(ssc, topics, {"metadata.broker.list": broker_list})
jsons = inputs.map(lambda input: json.loads(input[1]))
user_id_rdd = jsons.map(lambda json: json["user_id"])
# the under line is not working. Any another methods?
user_id_list = user_id_rdd.collect()
user_record_rdd = load_records(sc, 'user', user_id_list)

Python - Getting Attributes From A File of Constants

I have a file of constant variables that I need to query and I am not sure how to go about it.
I have a database query which is returning user names and I need to find the matching user name in the file of constant variables.
The file looks like this:
SALES_MANAGER_01 = {"user_name": "BO01", "password": "password", "attend_password": "BO001",
"csm_password": "SM001", "employee_num": "BOSM001"}
There is just a bunch of users just like the one above.
My function looks like this:
#attr("user_test")
def test_get_user_for_login(self):
application_code = 'BO'
user_from_view = self.select_user_for_login(application_code=application_code)
users = [d['USER'] for d in user_from_view]
user_with_ent = choice(users)
user_wo_ent = user_with_ent[-4:]
password = ""
global_users = dir(gum)
for item in global_users:
if user_wo_ent not in item.__getattr__("user_name"):
user_with_ent = choice(users)
user_wo_ent = user_with_ent[-4:]
else:
password = item.__getattr__("password")
print(user_wo_ent, password)
global_users = dir(gum) is my file of constants. So I know I am doing something wrong since I am getting an attribute error AttributeError: 'str' object has no attribute '__getattr__', I am just not sure how to go about resolving it.
You should reverse your looping as you want to compare each item to your match condition. Also, you have a dictionary, so use it to do some heavy lifting.
You need to add some imports
import re
from ast import literal_eval
I've changed the dir(gum) bit to be this function.
def get_global_users(filename):
gusers = {} # create a global users dict
p_key = re.compile(ur'\b\w*\b') # regex to get first part, e.g.. SALES_MANAGER_01
p_value = re.compile(ur'\{.*\}') # regex to grab everything in {}
with (open(filename)) as f: # open the file and work through it
for line in f: # for each line
gum_key = p_key.match(line) # pull out the key
gum_value = p_value.search(line) # pull out the value
''' Here is the real action. update a dictionary
with the match of gum_key and with match of gum_value'''
gusers[gum_key.group()] = literal_eval(gum_value.group())
return(gusers) # return the dictionary
The bottom of your existing code is replaced with this.
global_users = get_global_users(gum) # assign return to global_users
for key, value in global_users.iteritems(): # walk through all key, value pairs
if value['user_name'] != user_wo_ent:
user_with_ent = choice(users)
user_wo_ent = user_with_ent[-4:]
else:
password = value['password']
So a very simple answer was get the dir of the constants file then parsing over it like so:
global_users = dir(gum)
for item in global_users:
o = gum.__dict__[item]
if type(o) is not dict:
continue
if gum.__dict__[item].get("user_name") == user_wo_ent:
print(user_wo_ent, o.get("password"))
else:
print("User was not in global_user_mappings")
I was able to find the answer by doing the following:
def get_user_for_login(application_code='BO'):
user_from_view = BaseServiceTest().select_user_for_login(application_code=application_code)
users = [d['USER'] for d in user_from_view]
user_with_ent = choice(users)
user_wo_ent = user_with_ent[4:]
global_users = dir(gum)
user_dict = {'user_name': '', 'password': ''}
for item in global_users:
o = gum.__dict__[item]
if type(o) is not dict:
continue
if user_wo_ent == o.get("user_name"):
user_dict['user_name'] = user_wo_ent
user_dict['password'] = o.get("password")
return user_dict

How to mock a redis client in Python?

I just found that a bunch of unit tests are failing, due a developer hasn't mocked out the dependency to a redis client within the test. I'm trying to give a hand in this matter but have difficulties myself.
The method writes to a redis client:
redis_client = get_redis_client()
redis_client.set('temp-facility-data', cPickle.dumps(df))
Later in the assert the result is retrieved:
res = cPickle.loads(get_redis_client().get('temp-facility-data'))
expected = pd.Series([set([1, 2, 3, 4, 5])], index=[1])
assert_series_equal(res.variation_pks, expected)
I managed to patch the redis client's get() and set() successfully.
#mock.patch('redis.StrictRedis.get')
#mock.patch('redis.StrictRedis.set')
def test_identical(self, mock_redis_set, mock_redis_get):
mock_redis_get.return_value = ???
f2 = deepcopy(self.f)
f3 = deepcopy(self.f)
f2.pk = 2
f3.pk = 3
self.one_row(f2, f3)
but I don't know how to set the return_value of get() to what the set() would set in the code, so that the test would pass.
Right now this line fails the test:
res = cPickle.loads(get_redis_client().get('temp-facility-data'))
TypeError: must be string, not MagicMock
Any advice please?
Think you can use side effect to set and get value in a local dict
data = {}
def set(key, val):
data[key] = val
def get(key):
return data[key]
mock_redis_set.side_effect = set
mock_redis_get.side_effect = get
not tested this but I think it should do what you want
If you want something more complete, you can try fakeredis
#patch("redis.Redis", return_value=fakeredis.FakeStrictRedis())
def test_something():
....
I think you can do something like this.
redis_cache = {
"key1": (b'\x80\x04\x95\x08\x00\x00\x00\x00\x00\x00\x00\x8c\x04test\x94.', "test"),
"key2": (None, None),
}
def get(redis_key):
if redis_key in redis_cache:
return redis_cache[redis_key][0]
else:
return None
mock = MagicMock()
mock.get = Mock(side_effect=get)
with patch('redis.StrictRedis', return_value=mock) as p:
for key in redis_cache:
result = self.MyClass.my_function(key)
self.assertEqual(result, redis_cache[key][1])

Categories

Resources