Query data from Cloud Firestore - Python

I have a query that I use to fetch data from Cloud Firestore, but it throws a KeyError and I am not sure what is wrong. The code is below.
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import pandas as pd
import sys
import os
import fileinput
# Use the application default credentials
cred = credentials.Certificate('creddatabase.json')
firebase_admin.initialize_app(cred)
db = firestore.client()
docs = db.collection(u'testcollection').stream()
users_ref = db.collection(u'testcollection')
docs = users_ref.stream()
array = []
is_header_printed = False
rows_keys = None
out_file = open("testcollection.csv","w")
for doc in docs:
    print(doc.id)
    row_items = doc.to_dict()
    print(doc)
    if is_header_printed == False:
        rows_keys = list(row_items.keys())
        rows_keys.sort(reverse=False)
        # loop through every key and print the header row
        for key in rows_keys:
            print(key)
            out_file.write(str(key.replace(",", "_")) + ",")
        out_file.write("\n")
        is_header_printed = True
    for key in rows_keys:
        print(key)
        out_file.write(str(row_items[key]).replace(',', ' ') + ',')
    out_file.write("\n")
    print(str(doc.to_dict()['StartDate'].year) + '/' + str(doc.to_dict()['StartDate'].month) + '/' + str(doc.to_dict()['StartDate'].day))
out_file.close()
My document in collection looks like this:
Comments: ""
Objectived: "test"
Media: "test media"
notes: "test notest"
Zone A: 0
Zone B: 0
Error message:
Traceback (most recent call last):
File "/Users/DataScience/Firebasequery/Firebase_query_for_psolzsol_jnhibtionsingalling.py", line 49, in <module>
out_file.write(str(row_items[key]).replace(',',' ')+',')
KeyError: 'Media'
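No answer was recorded here, but the KeyError indicates that a later document lacks one of the header fields taken from the first document ('Media' in this case). A minimal sketch of a more tolerant write loop, offered as an illustration rather than as part of the thread, falls back to an empty string for missing keys:

    for key in rows_keys:
        # .get() avoids the KeyError when a document is missing one of the header fields
        out_file.write(str(row_items.get(key, '')).replace(',', ' ') + ',')
    out_file.write("\n")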

Related

Traceback in <module>: from extractdocx import * / from docx import opendocx, getdocumenttext / from exceptions import PendingDeprecationWarning

Traceback (most recent call last):
File "C:\xampp\htdocs\Plag\scripts\main.py", line 8, in
from extractdocx import *
File "C:\xampp\htdocs\Plag\scripts\extractdocx.py", line 18, in
from docx import opendocx, getdocumenttext
File "C:\Users\zeesh\AppData\Local\Programs\Python\Python39\lib\site-packages\docx.py", line 30, in
from exceptions import PendingDeprecationWarning
ModuleNotFoundError: No module named 'exceptions'
# -*- coding: utf-8 -*-
# Master script for the plagiarism-checker
# Coded by: Shashank S Rao

# import other modules
from cosineSim import *
from htmlstrip import *
from extractdocx import *

# import required modules
import codecs
import traceback
import sys
import operator
import urllib.request, urllib.parse, urllib.error
import json as simplejson

# Given a text string, remove all non-alphanumeric
# characters (using Unicode definition of alphanumeric).
def getQueries(text, n):
    import re
    sentenceEnders = re.compile('[.!?]')
    sentenceList = sentenceEnders.split(text)
    sentencesplits = []
    for sentence in sentenceList:
        x = re.compile(r'\W+', re.UNICODE).split(sentence)
        x = [ele for ele in x if ele != '']
        sentencesplits.append(x)
    finalq = []
    for sentence in sentencesplits:
        l = len(sentence)
        l = l // n  # integer division so range() below receives an int under Python 3
        index = 0
        for i in range(0, l):
            finalq.append(sentence[index:index + n])
            index = index + n - 1
        if index != len(sentence):
            finalq.append(sentence[len(sentence) - index:len(sentence)])
    return finalq

# Search the web for the plagiarised text
# Calculate the cosineSimilarity of the given query vs matched content on google
# This is returned as 2 dictionaries
def searchWeb(text, output, c):
    try:
        text = text.encode('utf-8')
    except:
        text = text
    query = urllib.parse.quote_plus(text)
    if len(query) > 60:
        return output, c
    # using googleapis for searching web
    base_url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q='
    url = base_url + '%22' + query + '%22'
    request = urllib.request.Request(url, None, {'Referer': 'Google Chrome'})
    response = urllib.request.urlopen(request)
    results = simplejson.load(response)
    try:
        if (len(results) and 'responseData' in results and 'results' in results['responseData'] and results['responseData']['results'] != []):
            for ele in results['responseData']['results']:
                Match = results['responseData']['results'][0]
                content = Match['content']
                if Match['url'] in output:
                    # print text
                    # print strip_tags(content)
                    output[Match['url']] = output[Match['url']] + 1
                    c[Match['url']] = (c[Match['url']] * (output[Match['url']] - 1) + cosineSim(text, strip_tags(content))) / (output[Match['url']])
                else:
                    output[Match['url']] = 1
                    c[Match['url']] = cosineSim(text, strip_tags(content))
    except:
        return output, c
    return output, c

# Use the main function to scrutinize a file for
# plagiarism
def main():
    # n-grams N VALUE SET HERE
    n = 9
    if len(sys.argv) < 3:
        print("Usage: python main.py <input-filename>.txt <output-filename>.txt")
        sys.exit()
    if sys.argv[1].endswith(".docx"):
        t = docxExtract(sys.argv[1])
    else:
        t = open(sys.argv[1], 'r')
        if not t:
            print("Invalid Filename")
            print("Usage: python main.py <input-filename>.txt <output-filename>.txt")
            sys.exit()
        t = t.read()
    queries = getQueries(t, n)
    q = [' '.join(d) for d in queries]
    found = []
    # using 2 dictionaries: c and output
    # output is used to store the url as key and number of occurences of that url in different searches as value
    # c is used to store url as key and sum of all the cosine similarities of all matches as value
    output = {}
    c = {}
    i = 1
    count = len(q)
    if count > 100:
        count = 100
    for s in q[:100]:
        output, c = searchWeb(s, output, c)
        msg = "\r" + str(i) + "/" + str(count) + " completed..."
        sys.stdout.write(msg)
        sys.stdout.flush()
        i = i + 1
    # print ("\n")
    f = open(sys.argv[2], "w")
    for ele in sorted(iter(c.items()), key=operator.itemgetter(1), reverse=True):
        f.write(str(ele[0]) + " " + str(ele[1] * 100.00))
        f.write("\n")
    f.close()
    print("\nDone!")

if __name__ == "__main__":
    try:
        main()
    except:
        # writing the error to stdout for better error detection
        error = traceback.format_exc()
        print(("\nUh Oh!\n" + "Plagiarism-Checker encountered an error!:\n" + error))
The docx package's last release was in 2014. The code imports the exceptions module, which was a top-level module in Python 2.7 but was removed in Python 3:
$ python2.7 -c "import exceptions"
$ python3.7 -c "import exceptions"
Traceback (most recent call last):
File "<string>", line 1, in <module>
ModuleNotFoundError: No module named 'exceptions'
The bottom line: the package is only for Python 2. Use Python 2.7 or find a different package.
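If you go the second route, the maintained python-docx package can read a .docx file in a similar way. A rough sketch, not part of the original answer, and note that its API differs from the old opendocx/getdocumenttext helpers:

    from docx import Document  # pip install python-docx (not the 2014 "docx" package)

    def get_document_text(path):
        # collect the plain text of every paragraph in the .docx file
        doc = Document(path)
        return [paragraph.text for paragraph in doc.paragraphs]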

Twitter+Apache Kafka+ Spark Structured Streaming doesn't work

I want to run some example code from GitHub (https://github.com/kaantas/spark-twitter-sentiment-analysis). I followed the steps below:
Started zkserver
Started Kafka 2.5.0 (I am also using Apache Spark 3.0.0 and JDK 8)
Started tweet_listener.py (tweets start to stream; I can see them in the cmd window)
Opened twitter_topic_avg_sentiment_val.py with Spyder, and it just shows the text below
Note: I don't know anything about jars; if I need to use an external jar, please explain how.
Thanks a lot.
Traceback (most recent call last):
File "C:\Users\merha\Desktop\spark-twitter-sentiment-analysis-master\twitter_topic_avg_sentiment_val.py", line 40, in <module>
query.awaitTermination()
File "C:\Anaconda3\lib\site-packages\pyspark\sql\streaming.py", line 103, in awaitTermination
return self._jsq.awaitTermination()
File "C:\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "C:\Anaconda3\lib\site-packages\pyspark\sql\utils.py", line 137, in deco
raise_from(converted)
File "<string>", line 3, in raise_from
StreamingQueryException: org/apache/spark/kafka010/KafkaConfigUpdater
=== Streaming Query ===
Identifier: [id = f5dd9cb5-fcea-42ec-a20e-93a2ad233e1f, runId = 6cffdd89-3792-4500-a508-e4abc76425fb]
Current Committed Offsets: {}
Current Available Offsets: {}
Current State: INITIALIZING
Thread State: RUNNABLE
---------------- tweet_listener.py ----------------
from tweepy import Stream
from tweepy import OAuthHandler  # needed for the login below; missing in the original post
from tweepy.streaming import StreamListener
import json
import twitter_config
import pykafka
from afinn import Afinn
import sys
from sys import exit

class TweetListener(StreamListener):
    def __init__(self):
        self.client = pykafka.KafkaClient("localhost:9092")
        self.producer = self.client.topics[bytes('twitter3', 'ascii')].get_producer()

    def on_data(self, data):
        try:
            json_data = json.loads(data)
            send_data = '{}'
            json_send_data = json.loads(send_data)
            json_send_data['text'] = json_data['text']
            json_send_data['senti_val'] = afinn.score(json_data['text'])
            print(json_send_data['text'], " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ", json_send_data['senti_val'])
            self.producer.produce(bytes(json.dumps(json_send_data), 'ascii'))
            return True
        except KeyError:
            return True

    def on_error(self, status):
        print(status)
        return True

consumer_key = "xxxxxxxxxx"
consumer_secret = "xxxxxxxxxxx"
access_token = "xxxxxxxxxxxx"
access_secret = "xxxxxxxxxx"

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# create AFINN object for sentiment analysis
afinn = Afinn()

twitter_stream = Stream(auth, TweetListener())
twitter_stream.filter(languages=['en'], track=["big data"])
---------------- twitter_topic_avg_sentiment_val.py ----------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import json
import sys
from pyspark.sql.types import *

def fun(avg_senti_val):
    try:
        if avg_senti_val < 0: return 'NEGATIVE'
        elif avg_senti_val == 0: return 'NEUTRAL'
        else: return 'POSITIVE'
    except TypeError:
        return 'NEUTRAL'

if __name__ == "__main__":
    schema = StructType([
        StructField("text", StringType(), True),
        StructField("senti_val", DoubleType(), True)
    ])
    spark = SparkSession.builder.appName("TwitterSentimentAnalysis").getOrCreate()
    kafka_df = spark.readStream.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").option("subscribe", "twitter3").option("startingOffsets", "earliest").load()
    kafka_df_string = kafka_df.selectExpr("CAST(value AS STRING)")
    tweets_table = kafka_df_string.select(from_json(col("value"), schema).alias("data")).select("data.*")
    sum_val_table = tweets_table.select(avg('senti_val').alias('avg_senti_val'))
    # udf = USER DEFINED FUNCTION
    udf_avg_to_status = udf(fun, StringType())
    # average of senti_val column to status column
    new_df = sum_val_table.withColumn("status", udf_avg_to_status("avg_senti_val"))
    query = kafka_df_string.writeStream.format("console").option("truncate", "false").start()
    query.awaitTermination()
After I downloaded the spark-token-provider-kafka-0-10 jar file and copied it to the Spark jars folder (or added it to SPARK_CLASSPATH), my problem was resolved.
Have you tried submitting Spark with the Kafka package as a configuration? See the third line of the spark-submit command below.
spark-submit --master yarn --deploy-mode cluster \
--py-files "${PY_ZIP}" \
--packages "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1" \
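If the job is launched from an IDE such as Spyder rather than through spark-submit, a similar effect can be had by setting spark.jars.packages on the session builder. This is a sketch, not part of the original answer; the artifact version should match your Spark and Scala build (3.0.x / 2.12 assumed here):

    from pyspark.sql import SparkSession

    # pull the Kafka connector at startup instead of passing --packages on the command line
    spark = (
        SparkSession.builder
        .appName("TwitterSentimentAnalysis")
        .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1")
        .getOrCreate()
    )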

Cloud firestore exceptions not caught by except block

import datetime
from threading import Timer
import firebase_admin
from firebase_admin import firestore
import calendar
db = firestore.Client()
col_ref = db.collection(u'tblAssgin').get()
current_serving = [doc.id for doc in col_ref]
#print(current_serving)

def sit_time():
    for i in current_serving:
        try:
            doc_ref = db.collection(u'tblAssgin').document(i)
        except:
            current_serving.remove(doc_ref)
        else:
            doc = doc_ref.get()
            a = doc.get('assginTime')
            assign_time = datetime.datetime.fromtimestamp(calendar.timegm(a.timetuple()))
            now = datetime.datetime.now()
            sitting_time = now - assign_time
            hours, remainder = divmod(sitting_time.seconds, 3600)
            minutes, seconds = divmod(remainder, 60)
            print('minutes:', minutes)
            updates = {u'sitting_time': minutes}
            doc_ref.update(updates)

t = None

def refresh():
    global t
    sit_time()
    t = Timer(60, refresh)
    t.daemon = True
    t.start()

refresh()
Basically, the above code first fetches all the document IDs of the 'tblAssgin' collection and stores them in the 'current_serving' list. Then it loops over each document, calculates the time, and runs again every 60 seconds. Now suppose I delete one document; that document will no longer be found. I want the document ID to be removed from the 'current_serving' list when a not-found exception is raised, but the exception is never caught.
Please help. Thanks in advance!
You're assuming CollectionReference.document() will throw an exception if the document does not exist. It doesn't.
>>> client.collection('non-existing').document('also-non-existing')
<google.cloud.firestore_v1beta1.document.DocumentReference object at 0x10feac208>
But DocumentReference.get() will throw an exception if the document doesn't exist.
>>> client.collection('non-existing').document('also-non-existing').get()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "~/gae/lib/python3.6/site-packages/google/cloud/firestore_v1beta1/document.py", line 432, in get
raise exceptions.NotFound(self._document_path)
google.api_core.exceptions.NotFound: 404 ~/databases/(default)/documents/non-existing/also-non-existing
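Building on that, one way to drop deleted documents from the list is to catch the not-found error around the get() call instead of around document(). This is a sketch, assuming the client version shown above that raises on a missing document; newer clients instead return a snapshot whose exists attribute is False, which the sketch also checks:

    from google.api_core import exceptions as gcp_exceptions

    def sit_time():
        # iterate over a copy so items can be removed safely while looping
        for doc_id in list(current_serving):
            doc_ref = db.collection(u'tblAssgin').document(doc_id)
            try:
                doc = doc_ref.get()  # this is the call that can raise NotFound
            except gcp_exceptions.NotFound:
                current_serving.remove(doc_id)
                continue
            # if the client returns a snapshot instead of raising, check exists
            if hasattr(doc, 'exists') and not doc.exists:
                current_serving.remove(doc_id)
                continue
            # ... compute and update sitting_time as in the original code ...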

how to return a collection from mongodb using pymongo

I'm trying to create a Collection Class in Python to access the various collections in my db. Here's what I've got:
import sys
import os
import pymongo
from pymongo import MongoClient

class Collection():
    client = MongoClient()

    def __init__(self, db, collection_name):
        self.db = db
        self.collection_name = collection_name
        # self.data_base = getattr(self.client, db)
        # self.collObject = getattr(self.data_base, self.collection_name)

    def getCollection(self):
        data_base = getattr(self.client, self.db)
        collObject = getattr(data_base, self.collection_name)
        return collObject

    def getCollectionKeys(self, collection):
        """Get a set of keys from a collection"""
        keys_list = []
        collection_list = collection.find()
        for document in collection_list:
            for field in document.keys():
                keys_list.append(field)
        keys_set = set(keys_list)
        return keys_set

if __name__ == '__main__':
    print "Begin Main"
    agents = Collection('hkpr_restore', 'agents')
    print "agents is", agents
    agents_collection = agents.getCollection
    print agents_collection
    print agents.getCollectionKeys(agents_collection)
I get the following output:
Begin Main
agents is <__main__.Collection instance at 0x10ff33e60>
<bound method Collection.getCollection of <__main__.Collection instance at 0x10ff33e60>>
Traceback (most recent call last):
File "collection.py", line 52, in <module>
print agents.getCollectionKeys(agents_collection)
File "collection.py", line 35, in getCollectionKeys
collection_list = collection.find()
AttributeError: 'function' object has no attribute 'find'
The function getCollectionKeys works fine outside of a class. What am I doing wrong?
This line:
agents_collection = agents.getCollection
Should be:
agents_collection = agents.getCollection()
Also, you don't need to use getattr the way you are. Your getCollection method can be:
def getCollection(self):
    return self.client[self.db][self.collection_name]
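For completeness, a quick usage sketch with the call fixed (database and collection names are the ones from the question):

    agents = Collection('hkpr_restore', 'agents')
    agents_collection = agents.getCollection()  # note the parentheses: call the method
    print agents.getCollectionKeys(agents_collection)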

Getting the Profile Feed from Google Apps

I am trying to get the profiles feed from my Google Apps domain using the gdata Python library supplied by Google. This is my code:
import atom
import gdata.auth
import gdata.contacts
import gdata.contacts.service
gd_client = gdata.contacts.service.ContactsService()
gd_client.email = 'name#domain.com'
gd_client.password = 'password'
gd_client.source = 'madeupgibberish'
gd_client.account_type = 'HOSTED'
gd_client.contact_list = 'domain.com'
gd_client.ProgrammaticLogin()
def PrintFeed(feed):
    for i, entry in enumerate(feed.entry):
        print '\n%s %s' % (i+1, entry.title.text)

max_results = raw_input('Enter max return: ')
feed_uri = gd_client.GetProfilesFeed()
query = gdata.contacts.service.ContactsQuery(feed_uri)
print(feed_uri)
query.max_results = max_results
#query.orderby='title'
feed = gd_client.GetContactsFeed(query.ToUri())
# Use the print feed method defined above.
PrintFeed(feed)
print(feed_uri)
#print feed
f = open('c:\\python27\\junk.xml', 'w')
f.write(str(feed))
f.close()
When I run this it returns:
C:\Python27\Lib\gdata-2.0.16>python contactAPI.py
Enter max return: 300
Traceback (most recent call last):
File "contactAPI.py", line 27, in <module>
feed_uri = gd_client.GetProfilesFeed()
File "build\bdist.win-amd64\egg\gdata\contacts\service.py", line 294, in GetProfilesFeed
File "build\bdist.win-amd64\egg\gdata\service.py", line 1108, in Get
gdata.service.RequestError: {'status': 403, 'body': 'Version 1.0 is not supported.', 'reason': 'Forbidden'}
I am able to use GetContactsFeed and other feeds, but I cannot get profiles. Any idea what's happening here or what I need to fix? Thank you in advance for your help.
The gdata.contacts.service module uses the deprecated version of the API. You should use gdata.contacts.{client, data} instead.
Here is a sample getting users profiles.
import atom
import gdata.auth
import gdata.contacts
import gdata.contacts.client
email = 'admin#domain.com'
password = 'password'
domain = 'domain.com'
gd_client = gdata.contacts.client.ContactsClient(domain=domain)
gd_client.ClientLogin(email, password, 'madeupgibberish')
def PrintFeed(feed):
    for i, entry in enumerate(feed.entry):
        print '\n%s %s' % (i+1, entry.title.text)

feed_link = atom.data.Link(gd_client.GetFeedUri(kind='profiles'))
while feed_link:
    profiles_feed = gd_client.GetProfilesFeed(uri=feed_link.href)
    PrintFeed(profiles_feed)
    feed_link = profiles_feed.GetNextLink()
The library's contact_sample.py and unshare_profiles.py samples work with the client and data modules.
