I am currently working on a project where I want to extract the tweet text and the time of creation and put this data in a CSV file. The files I am analysing are large text files (~800 MB-1.5 GB) containing JSON data. I used the program below to collect the data and piped its output into a text file.
import tweepy as tp
import sys
import pandas as pd
#Variables that contain the user credentials to access the Twitter API
access_token = "..."
access_token_secret = "..."
consumer_key = "..."
consumer_secret = "..."
tweets_data = []
#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(tp.StreamListener):

    def on_data(self, data):
        print(data)
        return True

    def on_error(self, status):
        print(status)

if __name__ == '__main__':
    #This handles Twitter authentication and the connection to Twitter Streaming API
    l = StdOutListener()
    auth = tp.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = tp.Stream(auth, l)
    stream.filter(track=['Manchester United'])
EDIT: Here is a sample output of the above program.
{"created_at":"Mon Feb 09 07:58:51 +0000 2015","id":564694906233307137,"id_str":"564694906233307137","text":"RT #ManUtd: Take an alternative look at United's starting line-up today, courtesy of #MUTV. #mufclive\nhttps:\/\/t.co\/m1n1JkgRYq","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":306297595,"id_str":"306297595","name":"Agus Wiratama","screen_name":"KunirKm","location":"Bali","url":null,"description":"girls that are uniqe and beautiful in their own way|| #GGMU #Libra #IG : #Kunirkm","protected":false,"verified":false,"followers_count":176,"friends_count":102,"listed_count":1,"favourites_count":39,"statuses_count":4810,"created_at":"Fri May 27 16:45:02 +0000 2011","utc_offset":-32400,"time_zone":"Alaska","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_tile":false,"profile_link_color":"0084B4","profile_sidebar_border_color":"A8C7F7","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/561223265025138688\/J3SFBWV4_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/561223265025138688\/J3SFBWV4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/306297595\/1400412027","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sun Feb 08 15:52:42 +0000 2015","id":564451764460474369,"id_str":"564451764460474369","text":"Take an alternative look at United's starting line-up today, courtesy of #MUTV. 
#mufclive\nhttps:\/\/t.co\/m1n1JkgRYq","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":558797310,"id_str":"558797310","name":"Manchester United","screen_name":"ManUtd","location":"#mufc","url":"http:\/\/www.manutd.com","description":"Official Twitter of Manchester United FC","protected":false,"verified":true,"followers_count":4388116,"friends_count":84,"listed_count":12006,"favourites_count":0,"statuses_count":11840,"created_at":"Fri Apr 20 15:17:43 +0000 2012","utc_offset":0,"time_zone":"Casablanca","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/491881264232677376\/VcPcDO7o.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/491881264232677376\/VcPcDO7o.jpeg","profile_background_tile":false,"profile_link_color":"B30000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/563854496074194947\/p74gErkN_normal.png","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/563854496074194947\/p74gErkN_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/558797310\/1423268331","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1338,"favorite_count":752,"entities":{"hashtags":[{"text":"MUTV","indices":[73,78]},{"text":"mufclive","indices":[80,89]}],"trends":[],"urls":[{"url":"https:\/\/t.co\/m1n1JkgRYq","expanded_url":"https:\/\/amp.twimg.com\/v\/c79db33a-7fa9-4993-be9d-12990ee17b6b","display_url":"amp.twimg.com\/v\/c79db33a-7fa\u2026","indices":[90,113]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"retweet_count":0,"favorite_count":0,"entities":
I have then tried to read this file to extract the information I need using this program.
import simplejson as json
from pandas import DataFrame as df
import time
if __name__ == "__main__":
    tweets_data_path = '/input.txt' #Input file path
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    tweets = df()
    tweets1 = df()
    i = 0
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
            i += 1
            print(i)
            if i > 10000:
                i = 0
                tweets['CreatedAt'] = [tweet["created_at"] for tweet in tweets_data]
                tweets['text'] = [tweet["text"] for tweet in tweets_data]
                print(tweets_data)
                timestr = time.strftime("%Y%m%d-%H%M%S")
                filename = 'Out' + timestr + '.csv'
                print(filename)
                tweets.to_csv(filename, index=True)
                tweets_data.clear()
                tweets.drop()
                print(tweets)
                #print('Here I am')
        except:
            continue

    try:
        #Creating a new data frame as the old one creates a conflict with the size of the index
        tweets1['CreatedAt'] = [tweet["created_at"] for tweet in tweets_data]
        tweets1['text'] = [tweet["text"] for tweet in tweets_data]
        timestr1 = time.strftime("%Y%m%d-%H%M%S")
        filename1 = 'Out' + timestr1 + '.csv'
        tweets1.to_csv(filename, index=True)
    except:
        print('Excepti')
The problem is that I am not able to save more than one CSV file, even though the code keeps running for the entire length of the JSON file. Have I made an error in the looping or in how I manage the exceptions?
I am pretty new to Python and programming in general, so I would appreciate your help.
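For reference (this sketch is not from the original post), one way the loop could be restructured so that each chunk of 10,000 tweets gets its own fresh DataFrame, and the try/except only guards the JSON parsing rather than hiding errors from the CSV-writing step:

import json
import time

import pandas as pd

CHUNK_SIZE = 10000
chunk_number = 0
tweets_data = []

def write_chunk(records, chunk_number):
    # Build a fresh DataFrame per chunk so its index never conflicts with a previous chunk.
    chunk = pd.DataFrame({
        'CreatedAt': [t.get("created_at", "") for t in records],
        'text': [t.get("text", "") for t in records],
    })
    filename = 'Out{}-{}.csv'.format(time.strftime("%Y%m%d-%H%M%S"), chunk_number)
    chunk.to_csv(filename, index=True)

with open('input.txt', 'r') as tweets_file:   # path assumed; adjust to your file
    for line in tweets_file:
        line = line.strip()
        if not line:
            continue
        try:
            tweets_data.append(json.loads(line))
        except ValueError:
            continue   # skip lines that are not complete JSON (e.g. truncated output)
        if len(tweets_data) >= CHUNK_SIZE:
            write_chunk(tweets_data, chunk_number)
            chunk_number += 1
            tweets_data = []

if tweets_data:   # write whatever is left over at the end
    write_chunk(tweets_data, chunk_number)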
Related
I am trying to extract data from a Twitter JSON file retrieved using Tweepy streaming.
Here is my code for streaming:
class MyListener(Stream):

    t_count = 0

    def on_data(self, data):
        print(data)
        self.t_count += 0
        #stop by
        if self.t_count >= 5000:
            sys.exit("exit")
        return True

    def on_error(self, status):
        print(status)

if __name__ == '__main__':
    stream = MyListener(consumer_key, consumer_secret, access_token, access_token_secret)
    stream.filter(track=['corona'], languages=["en"])
Here is my code for reading the file:
with open("covid-test-out", "r") as f:
count = 0
for line in f:
data = json.loads(line)
Then I got the error
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Here is one line from the JSON file. I noticed that there is a b prefix in front of each line, but when I check the type of the line it is not a bytes object, it is still a string. I am not even sure whether this is the reason I cannot get the correct data.
b'{"created_at":"Mon Nov 22 07:37:46 +0000 2021","id":1462686730956333061,"id_str":"1462686730956333061","text":"RT #corybernardi: Scientists 'mystified'. \n\nhttps:\/\/t.co\/rvTYCUEQ74","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1336870146242056192,"id_str":"1336870146242056192","name":"Terence Byrnes","screen_name":"byrnes_terence","location":null,"url":null,"description":"Retired Aussie. Against mandatory vaccinations, government interference in our lives, and the climate cult. Now on Gab Social as a backup : Terence50","translator_type":"none","protected":false,"verified":false,"followers_count":960,"friends_count":1012,"listed_count":3,"favourites_count":15163,"statuses_count":171876,"created_at":"Thu Dec 10 03:08:01 +0000 2020","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1428994180458508292\/fT2Olt4J_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1428994180458508292\/fT2Olt4J_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1336870146242056192\/1631520259","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null,"withheld_in_countries":[]},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sun Nov 21 19:42:14 +0000 2021","id":1462506658421112834,"id_str":"1462506658421112834","text":"Scientists 'mystified'. \n\nhttps:\/\/t.co\/rvTYCUEQ74","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":80965423,"id_str":"80965423","name":"CoryBernardi.com.au","screen_name":"corybernardi","location":"Adelaide ","url":"http:\/\/www.corybernardi.com.au","description":"Get your free Weekly Dose of Common Sense email at https:\/\/t.co\/MAJpp7iZJy.\n\nLaughing at liars and leftists since 2006. 
Tweets deleted weekly to infuriate losers.","translator_type":"none","protected":false,"verified":true,"followers_count":47794,"friends_count":63,"listed_count":461,"favourites_count":112,"statuses_count":55,"created_at":"Thu Oct 08 22:54:55 +0000 2009","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1446336496827387904\/Ay6QRHQt_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1446336496827387904\/Ay6QRHQt_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/80965423\/1633668973","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null,"withheld_in_countries":[]},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":5,"reply_count":30,"retweet_count":40,"favorite_count":136,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/rvTYCUEQ74","expanded_url":"https:\/\/apnews.com\/article\/coronavirus-pandemic-science-health-pandemics-united-nations-fcf28a83c9352a67e50aa2172eb01a2f","display_url":"apnews.com\/article\/corona\u2026","indices":[26,49]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/rvTYCUEQ74","expanded_url":"https:\/\/apnews.com\/article\/coronavirus-pandemic-science-health-pandemics-united-nations-fcf28a83c9352a67e50aa2172eb01a2f","display_url":"apnews.com\/article\/corona\u2026","indices":[44,67]}],"user_mentions":[{"screen_name":"corybernardi","name":"CoryBernardi.com.au","id":80965423,"id_str":"80965423","indices":[3,16]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1637566666722"}'
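The b'...' prefix suggests that each line is the printed repr of a bytes object (the streaming code prints the raw data, which arrives as bytes); read back from the text file, the line is of course a str that merely starts with the characters b'. Under that assumption, a minimal sketch for recovering the JSON could look like this:

import ast
import json

tweets = []
with open("covid-test-out", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            # Turn the textual repr of a bytes object (b'...') back into real bytes,
            # then decode it to a normal JSON string before parsing.
            if line.startswith("b'") or line.startswith('b"'):
                line = ast.literal_eval(line).decode("utf-8")
            tweets.append(json.loads(line))
        except (ValueError, SyntaxError):
            continue  # skip truncated or otherwise unparseable lines

print(len(tweets))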
Today I am working on a project where incoming phone calls are transcribed and saved into text files, but I am also fairly new to Python and Python loops.
I want to loop over a SQL Server column and run each row through the Azure Speech-to-Text service I use (all of the phone call OIDs). I have been stuck on this problem for a couple of days now, so I thought I might find some help here.
import azure.cognitiveservices.speech as speechsdk
import time
from os import path
from pydub import AudioSegment
import requests
import hashlib
import sys
import os.path
import pyodbc
databaseName = '*'
username = '*'
password = '*'
server = '*'
driver = '*'
try:
    CONNECTION_STRING = 'DRIVER='+driver+';SERVER='+server+';DATABASE='+databaseName+';UID='+username+';PWD='+ password
    conn = pyodbc.connect(CONNECTION_STRING)
    cursor = conn.cursor()
    storedproc = "* = *'"
    cursor.execute(storedproc)
    row = cursor.fetchone()
    while row:
        array = [(int(row[1]))]
        row = cursor.fetchone()

    i = 0
    while i < len(array):
        OID = (array[i])
        i = i + 1
        print(OID)

    string = f"{OID}*"
    encoded = string.encode()
    result = hashlib.sha256(encoded)
    resultHash = (result.hexdigest())

    Telefoongesprek = requests.get(f"*{OID}", headers={f"api-key": f"{resultHash}"})
    with open("Telefoongesprek.mp3", "wb") as f:
        f.write(Telefoongesprek.content)

    src = "Telefoongesprek.mp3"
    dst = "Telefoongesprek.wav"
    sound = AudioSegment.from_file(src)
    sound.export(dst, format="wav")

    def speech_recognize_continuous_from_file():
        speech_config = speechsdk.SpeechConfig(subscription="*", region="*")
        speech_config.speech_recognition_language = "nl-NL"
        audio_config = speechsdk.audio.AudioConfig(filename="Telefoongesprek.wav")
        speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

        done = False

        def stop_cb(evt):
            print('CLOSING on {}'.format(evt))
            nonlocal done
            done = True

        all_results = []

        def handle_final_result(evt):
            all_results.append(evt.result.text)

        speech_recognizer.recognized.connect(handle_final_result)
        speech_recognizer.session_started.connect(handle_final_result)
        speech_recognizer.session_stopped.connect(handle_final_result)
        speech_recognizer.canceled.connect(handle_final_result)
        speech_recognizer.session_stopped.connect(stop_cb)
        speech_recognizer.canceled.connect(stop_cb)

        speech_recognizer.start_continuous_recognition()
        while not done:
            time.sleep(.5)
        speech_recognizer.stop_continuous_recognition()

        print(all_results)
        telefoongesprek = str(all_results)
        filename = f"C:\\Users\\Beau\\Contact-verkeer\\contact-verkeer\\telefoon\\STT Transcriptions\\Telefoongesprek#{OID}.txt"
        file = open(filename, "w")
        file.write(telefoongesprek)
        file.close()

    speech_recognize_continuous_from_file()

    cursor.close()
    del cursor
    conn.close()

except Exception as e:
    print("Error: %s" % e)
Everything works on its own, but I just don't know where to place the loop and which one I should use (a for or a while loop). Right now I'm trying to loop over an array, but I don't think this is correct.
Error message: Decoding failed. ffmpeg returned error code: 1
[mp3 # 000001cb8c57e0o0] Failed to read frame size: could not seek to 1073.
which I am pretty sure means that my Azure function can't find an mp3 file, which in turn means that the mp3-to-wav conversion doesn't work.
Thanks in advance!
If I understand your question, you have a database with lots of phone call details. One of the field values in each row is used to fetch the associated mp3 file. You want to do speech to text using Azure on each of the mp3 files referenced in your database.
So you can do it in two ways:
Iterate through all rows in the database and save all the associated files into a folder on the local disk, with the OID as the filename. Then write another loop to iterate through this folder and send the files for transcription to the Azure Speech to Text service (a rough sketch of this two-pass variant is shown below).
The other technique is to do everything in a single loop, like the way you have shown, which will require some corrections.
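As a rough, untested sketch of the first (two-pass) approach, assuming the mp3 files have already been saved into a local folder named by OID (the folder name here is made up), the second pass could simply be:

import os

DOWNLOAD_DIR = "calls"  # assumed folder where pass 1 saved the <OID>.mp3 files

for name in os.listdir(DOWNLOAD_DIR):
    if name.endswith(".mp3"):
        # speech_recognize_continuous_from_file() is the function defined further below
        speech_recognize_continuous_from_file(os.path.join(DOWNLOAD_DIR, name))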
Ok, so now that part is clear, we can go into the speech-to-text part. Azure allows you to send the compressed format for transcription, which means you don't actually need to convert it into a wav file.
Please have a look at the modified code below with the changes:
# code snippet borrowed from azure samples
def speech_recognize_continuous_from_file(filename):

    class BinaryFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
        def __init__(self, filename: str):
            super().__init__()
            self._file_h = open(filename, "rb")

        def read(self, buffer: memoryview) -> int:
            try:
                size = buffer.nbytes
                frames = self._file_h.read(size)
                buffer[:len(frames)] = frames
                return len(frames)
            except Exception as ex:
                print('Exception in `read`: {}'.format(ex))
                raise

        def close(self) -> None:
            print('closing file')
            try:
                self._file_h.close()
            except Exception as ex:
                print('Exception in `close`: {}'.format(ex))
                raise

    # Creates an audio stream format. For an example we are using MP3 compressed file here
    compressed_format = speechsdk.audio.AudioStreamFormat(compressed_stream_format=speechsdk.AudioStreamContainerFormat.MP3)
    callback = BinaryFileReaderCallback(filename=filename)
    stream = speechsdk.audio.PullAudioInputStream(stream_format=compressed_format, pull_stream_callback=callback)

    speech_config = speechsdk.SpeechConfig(subscription="*", region="*")
    speech_config.speech_recognition_language = "nl-NL"
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    # Creates a speech recognizer using a file as audio input, also specify the speech language
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config, audio_config)

    done = False

    def stop_cb(evt):
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    all_results = []

    def handle_final_result(evt):
        all_results.append(evt.result.text)

    speech_recognizer.recognized.connect(handle_final_result)
    speech_recognizer.session_started.connect(handle_final_result)
    speech_recognizer.session_stopped.connect(handle_final_result)
    speech_recognizer.canceled.connect(handle_final_result)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    speech_recognizer.stop_continuous_recognition()

    print(all_results)
    telefoongesprek = str(all_results)
    filename = f"C:\\Users\\Beau\\Contact-verkeer\\contact-verkeer\\telefoon\\STT Transcriptions\\Telefoongesprek#{OID}.txt"
    file = open(filename, "w")
    file.write(telefoongesprek)
    file.close()


try:
    CONNECTION_STRING = 'DRIVER='+driver+';SERVER='+server+';DATABASE='+databaseName+';UID='+username+';PWD='+ password
    conn = pyodbc.connect(CONNECTION_STRING)
    cursor = conn.cursor()
    storedproc = "* = *'"
    cursor.execute(storedproc)
    row = cursor.fetchone()

    # loop through the rows
    while row:
        array = [(int(row[1]))]
        i = 0
        while i < len(array):
            OID = (array[i])
            i = i + 1
            print(OID)

        string = f"{OID}*"
        encoded = string.encode()
        result = hashlib.sha256(encoded)
        resultHash = (result.hexdigest())

        telefoongesprek_response = requests.get(f"*{OID}", headers={f"api-key": f"{resultHash}"})

        # save the file to local disk as mp3
        with open("Telefoongesprek.mp3", "wb") as f:
            f.write(telefoongesprek_response.content)

        # do the speech to text on the mp3 file
        speech_recognize_continuous_from_file(f.name)

        # fetch the next row
        row = cursor.fetchone()

    cursor.close()
    del cursor
    conn.close()

except Exception as e:
    print("Error: %s" % e)
I haven't tested this full code as I don't have the DB connection with me. Please feel free to modify it for your use case and let me know if you have any issues.
So I am trying my best to navigate my way through the Facebook API. I need to create a script that will download my business' campaign information daily as a CSV file, so I can use another script to upload the information to our database easily.
I finally have code that works to print the information to the log, but I am reaching the user request limit because I have to call get_insights() for every single campaign individually. I am wondering if anyone knows how to help me make it so I don't have to call the Facebook API as often.
What I would like to do is find a field where I can get the daily spend, so I don't have to call the API in every iteration of my for-campaign loop, but I cannot for the life of me find a way to do so.
#Import all the facebook mumbo jumbo
from facebookads.api import FacebookAdsApi
from facebookads.adobjects.adset import AdSet
from facebookads.adobjects.campaign import Campaign
from facebookads.adobjects.adsinsights import AdsInsights
from facebookads.adobjects.adreportrun import AdReportRun
from facebookads.adobjects.adaccount import AdAccount
from facebookads.adobjects.business import Business
import time
#Set the login info
my_app_id = '****'
my_app_secret = '****'
my_access_token = '****'
#Start the connection to the facebook API
FacebookAdsApi.init(my_app_id, my_app_secret, my_access_token)
business = Business('****')
#Get all ad accounts on the business account
accounts = business.get_owned_ad_accounts(fields=[AdAccount.Field.id])
#iterate through all accounts in the business account
for account in accounts:
    tempaccount = AdAccount(account[AdAccount.Field.id])
    #get all campaigns in the adaccount
    campaigns = tempaccount.get_campaigns(fields=[Campaign.Field.name,Campaign.Field])
    #iterate through all the campaigns in the adaccount
    for campaign in campaigns:
        print(campaign[Campaign.Field.name])
        #get the insight info (spend) from each campaign
        campaignsights = campaign.get_insights(params={'date_preset':'yesterday'},fields=[AdsInsights.Field.spend])
        print(campaignsights)
It took a while of digging through the API and guessing but I got it! Here is my final script:
# This program downloads all relevant Facebook traffic info as a csv file
# This program requires info from the Facebook Ads API: https://github.com/facebook/facebook-python-ads-sdk
# Import all the facebook mumbo jumbo
from facebookads.api import FacebookAdsApi
from facebookads.adobjects.adsinsights import AdsInsights
from facebookads.adobjects.adaccount import AdAccount
from facebookads.adobjects.business import Business
# Import the csv writer and the date/time function
import datetime
import csv
# Set the info to get connected to the API. Do NOT share this info
my_app_id = '****'
my_app_secret = '****'
my_access_token = '****'
# Start the connection to the facebook API
FacebookAdsApi.init(my_app_id, my_app_secret, my_access_token)
# Create a business object for the business account
business = Business('****')
# Get yesterday's date for the filename, and the csv data
yesterdaybad = datetime.datetime.now() - datetime.timedelta(days=1)
yesterdayslash = yesterdaybad.strftime('%m/%d/%Y')
yesterdayhyphen = yesterdaybad.strftime('%m-%d-%Y')
# Define the destination filename
filename = yesterdayhyphen + '_fb.csv'
filelocation = "/cron/downloads/"+ filename
# Get all ad accounts on the business account
accounts = business.get_owned_ad_accounts(fields=[AdAccount.Field.id])
# Open or create new file
try:
    csvfile = open(filelocation, 'w+', 0777)
except:
    print("Cannot open file.")

# To keep track of rows added to file
rows = 0

try:
    # Create file writer
    filewriter = csv.writer(csvfile, delimiter=',')
except Exception as err:
    print(err)

# Iterate through the adaccounts
for account in accounts:
    # Create an addaccount object from the adaccount id to make it possible to get insights
    tempaccount = AdAccount(account[AdAccount.Field.id])
    # Grab insight info for all ads in the adaccount
    ads = tempaccount.get_insights(params={'date_preset': 'yesterday',
                                           'level': 'ad'},
                                   fields=[AdsInsights.Field.account_id,
                                           AdsInsights.Field.account_name,
                                           AdsInsights.Field.ad_id,
                                           AdsInsights.Field.ad_name,
                                           AdsInsights.Field.adset_id,
                                           AdsInsights.Field.adset_name,
                                           AdsInsights.Field.campaign_id,
                                           AdsInsights.Field.campaign_name,
                                           AdsInsights.Field.cost_per_outbound_click,
                                           AdsInsights.Field.outbound_clicks,
                                           AdsInsights.Field.spend])
    # Iterate through all accounts in the business account
    for ad in ads:
        # Set default values in case the insight info is empty
        date = yesterdayslash
        accountid = ad[AdsInsights.Field.account_id]
        accountname = ""
        adid = ""
        adname = ""
        adsetid = ""
        adsetname = ""
        campaignid = ""
        campaignname = ""
        costperoutboundclick = ""
        outboundclicks = ""
        spend = ""
        # Set values from insight data
        if ('account_id' in ad):
            accountid = ad[AdsInsights.Field.account_id]
        if ('account_name' in ad):
            accountname = ad[AdsInsights.Field.account_name]
        if ('ad_id' in ad):
            adid = ad[AdsInsights.Field.ad_id]
        if ('ad_name' in ad):
            adname = ad[AdsInsights.Field.ad_name]
        if ('adset_id' in ad):
            adsetid = ad[AdsInsights.Field.adset_id]
        if ('adset_name' in ad):
            adsetname = ad[AdsInsights.Field.adset_name]
        if ('campaign_id' in ad):
            campaignid = ad[AdsInsights.Field.campaign_id]
        if ('campaign_name' in ad):
            campaignname = ad[AdsInsights.Field.campaign_name]
        if ('cost_per_outbound_click' in ad):  # This is stored strangely, takes a few steps to break through the layers
            costperoutboundclicklist = ad[AdsInsights.Field.cost_per_outbound_click]
            costperoutboundclickdict = costperoutboundclicklist[0]
            costperoutboundclick = costperoutboundclickdict.get('value')
        if ('outbound_clicks' in ad):  # This is stored strangely, takes a few steps to break through the layers
            outboundclickslist = ad[AdsInsights.Field.outbound_clicks]
            outboundclicksdict = outboundclickslist[0]
            outboundclicks = outboundclicksdict.get('value')
        if ('spend' in ad):
            spend = ad[AdsInsights.Field.spend]
        # Write all ad info to the file, and increment the number of rows that will display
        filewriter.writerow([date, accountid, accountname, adid, adname, adsetid, adsetname, campaignid, campaignname, costperoutboundclick, outboundclicks, spend])
        rows += 1

csvfile.close()

# Print report
print(str(rows) + " rows added to the file " + filename)
I then have a PHP script that takes the CSV file and uploads it to my database. The key is pulling all the insight data in one big yank. You can then break it up however you want, because each ad row carries information about its ad set, ad account, and campaign.
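To illustrate that "break it up however you want" point, here is a small sketch (not part of the original script, filename made up) that rolls the ad-level rows written above up to per-campaign spend, relying on the column order passed to filewriter.writerow():

import csv
from collections import defaultdict

spend_per_campaign = defaultdict(float)

with open('/cron/downloads/01-01-2018_fb.csv') as f:   # hypothetical output file
    for row in csv.reader(f):
        campaignname, spend = row[8], row[11]           # columns 9 and 12 in the writerow() call
        if spend:
            spend_per_campaign[campaignname] += float(spend)

for name, total in sorted(spend_per_campaign.items()):
    print(name, total)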
Adding a couple of small functions to improve on LucyTurtle's answer as it is still susceptible to Facebook's Rate Limiting
import logging
import time
import requests as rq

#Function to find the string between two strings or characters
def find_between(s, first, last):
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""

#Function to check how close you are to the FB Rate Limit
def check_limit():
    check = rq.get('https://graph.facebook.com/v3.3/act_'+account_number+'/insights?access_token='+my_access_token)
    call = float(find_between(check.headers['x-business-use-case-usage'], 'call_count":', '}'))
    cpu = float(find_between(check.headers['x-business-use-case-usage'], 'total_cputime":', '}'))
    total = float(find_between(check.headers['x-business-use-case-usage'], 'total_time":', ','))
    usage = max(call, cpu, total)
    return usage

#Check if you reached 75% of the limit; if yes, back off for 5 minutes
#(put this chunk in your 'for ad in ads' loop, every 100-200 iterations)
if check_limit() > 75:
    print('75% Rate Limit Reached. Cooling Time 5 Minutes.')
    logging.debug('75% Rate Limit Reached. Cooling Time 5 Minutes.')
    time.sleep(300)
I'd just like to say thank you. As Marks Andre said - you made my day!
The FB SDK documentation is exhaustive, but it completely lacks practical implementation examples for day-to-day tasks like this one. Bookmark is set - this page will be revisited soon.
So the only thing I can actually contribute for fellow sufferers: it seems that with the newer facebook_business SDK you can simply replace "facebookads" in the import statements with "facebook_business".
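For example, based on the imports used in the script above (only the package name changes):

# before (old facebookads SDK)
# from facebookads.api import FacebookAdsApi
# from facebookads.adobjects.adsinsights import AdsInsights

# after (newer facebook_business SDK)
from facebook_business.api import FacebookAdsApi
from facebook_business.adobjects.adsinsights import AdsInsights
from facebook_business.adobjects.adaccount import AdAccount
from facebook_business.adobjects.business import Business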
I'd like to identify a method to obtain the worksheet ID within the URL for each of the worksheets within a Google Spreadsheet workbook. For example, the worksheet id for 'sheet2' of this workbook is '1244369280', since its url is https://docs.google.com/spreadsheets/d/1yd8qTYjRns4_OT8PbsZzH0zajvzguKS79dq6j--hnTs/edit#gid=1244369280
One method I've found is to pull the XML of a Google Spreadsheet, since according to this question the only way to get the worksheet ID is to stream down the XML of a worksheet, but the example is in VB.NET and I need to do this in Python.
This is the VB.NET code that I'd like to replicate in Python:
Dim worksheetFeed As WorksheetFeed
Dim query As WorksheetQuery
Dim worksheet As WorksheetEntry
Dim output As New MemoryStream
Dim xml As String
Dim gid As String = String.Empty

Try
    _service = New Spreadsheets.SpreadsheetsService("ServiceName")
    _service.setUserCredentials(UserId, Password)

    query = New WorksheetQuery(feedUrl)
    worksheetFeed = _service.Query(query)
    worksheet = worksheetFeed.Entries(0)

    ' Save worksheet feed to memory stream so we can
    ' get the xml returned from the feed url and look for
    ' the gid. Gid allows us to download the specific worksheet tab
    Using output
        worksheet.SaveToXml(output)
    End Using

    xml = Encoding.ASCII.GetString(output.ToArray())
It seems that the best way to get the XML from a Google Spreadsheet is using Gdata, so I've downloaded GData and tried the Google Spreadsheet example with my credentials.
See below
#!/usr/bin/python
#
# Copyright (C) 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__author__ = 'api.laurabeth#gmail.com (Laura Beth Lincoln)'
try:
    from xml.etree import ElementTree
except ImportError:
    from elementtree import ElementTree

import gdata.spreadsheet.service
import gdata.service
import atom.service
import gdata.spreadsheet
import atom
import getopt
import sys
import string


class SimpleCRUD:

    def __init__(self, email, password):
        self.gd_client = gdata.spreadsheet.service.SpreadsheetsService()
        self.gd_client.email = 'chris#curalate.com'
        self.gd_client.password = 'jkjkdioerzumawya'
        self.gd_client.source = 'Spreadsheets GData Sample'
        self.gd_client.ProgrammaticLogin()
        self.curr_key = ''
        self.curr_wksht_id = ''
        self.list_feed = None

    def _PromptForSpreadsheet(self):
        # Get the list of spreadsheets
        feed = self.gd_client.GetSpreadsheetsFeed()
        self._PrintFeed(feed)
        input = raw_input('\nSelection: ')
        id_parts = feed.entry[string.atoi(input)].id.text.split('/')
        self.curr_key = id_parts[len(id_parts) - 1]

    def _PromptForWorksheet(self):
        # Get the list of worksheets
        feed = self.gd_client.GetWorksheetsFeed(self.curr_key)
        self._PrintFeed(feed)
        input = raw_input('\nSelection: ')
        id_parts = feed.entry[string.atoi(input)].id.text.split('/')
        self.curr_wksht_id = id_parts[len(id_parts) - 1]

    def _PromptForCellsAction(self):
        print ('dump\n'
               'update {row} {col} {input_value}\n'
               '\n')
        input = raw_input('Command: ')
        command = input.split(' ', 1)
        if command[0] == 'dump':
            self._CellsGetAction()
        elif command[0] == 'update':
            parsed = command[1].split(' ', 2)
            if len(parsed) == 3:
                self._CellsUpdateAction(parsed[0], parsed[1], parsed[2])
            else:
                self._CellsUpdateAction(parsed[0], parsed[1], '')
        else:
            self._InvalidCommandError(input)

    def _PromptForListAction(self):
        print ('dump\n'
               'insert {row_data} (example: insert label=content)\n'
               'update {row_index} {row_data}\n'
               'delete {row_index}\n'
               'Note: No uppercase letters in column names!\n'
               '\n')
        input = raw_input('Command: ')
        command = input.split(' ', 1)
        if command[0] == 'dump':
            self._ListGetAction()
        elif command[0] == 'insert':
            self._ListInsertAction(command[1])
        elif command[0] == 'update':
            parsed = command[1].split(' ', 1)
            self._ListUpdateAction(parsed[0], parsed[1])
        elif command[0] == 'delete':
            self._ListDeleteAction(command[1])
        else:
            self._InvalidCommandError(input)

    def _CellsGetAction(self):
        # Get the feed of cells
        feed = self.gd_client.GetCellsFeed(self.curr_key, self.curr_wksht_id)
        self._PrintFeed(feed)

    def _CellsUpdateAction(self, row, col, inputValue):
        entry = self.gd_client.UpdateCell(row=row, col=col, inputValue=inputValue,
                                          key=self.curr_key, wksht_id=self.curr_wksht_id)
        if isinstance(entry, gdata.spreadsheet.SpreadsheetsCell):
            print 'Updated!'

    def _ListGetAction(self):
        # Get the list feed
        self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
        self._PrintFeed(self.list_feed)

    def _ListInsertAction(self, row_data):
        entry = self.gd_client.InsertRow(self._StringToDictionary(row_data),
                                         self.curr_key, self.curr_wksht_id)
        if isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
            print 'Inserted!'

    def _ListUpdateAction(self, index, row_data):
        self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
        entry = self.gd_client.UpdateRow(
            self.list_feed.entry[string.atoi(index)],
            self._StringToDictionary(row_data))
        if isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
            print 'Updated!'

    def _ListDeleteAction(self, index):
        self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
        self.gd_client.DeleteRow(self.list_feed.entry[string.atoi(index)])
        print 'Deleted!'

    def _StringToDictionary(self, row_data):
        dict = {}
        for param in row_data.split():
            temp = param.split('=')
            dict[temp[0]] = temp[1]
        return dict

    def _PrintFeed(self, feed):
        for i, entry in enumerate(feed.entry):
            if isinstance(feed, gdata.spreadsheet.SpreadsheetsCellsFeed):
                print '%s %s\n' % (entry.title.text, entry.content.text)
            elif isinstance(feed, gdata.spreadsheet.SpreadsheetsListFeed):
                print '%s %s %s' % (i, entry.title.text, entry.content.text)
                # Print this row's value for each column (the custom dictionary is
                # built using the gsx: elements in the entry.)
                print 'Contents:'
                for key in entry.custom:
                    print '  %s: %s' % (key, entry.custom[key].text)
                print '\n',
            else:
                print '%s %s\n' % (i, entry.title.text)

    def _InvalidCommandError(self, input):
        print 'Invalid input: %s\n' % (input)

    def Run(self):
        self._PromptForSpreadsheet()
        self._PromptForWorksheet()
        input = raw_input('cells or list? ')
        if input == 'cells':
            while True:
                self._PromptForCellsAction()
        elif input == 'list':
            while True:
                self._PromptForListAction()


def main():
    # parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["user=", "pw="])
    except getopt.error, msg:
        print 'python spreadsheetExample.py --user [username] --pw [password] '
        sys.exit(2)

    user = 'fake#gmail.com'
    pw = 'fakepassword'
    key = ''
    # Process options
    for o, a in opts:
        if o == "--user":
            user = a
        elif o == "--pw":
            pw = a

    if user == '' or pw == '':
        print 'python spreadsheetExample.py --user [username] --pw [password] '
        sys.exit(2)

    sample = SimpleCRUD(user, pw)
    sample.Run()


if __name__ == '__main__':
    main()
However this returns the following error:
Traceback (most recent call last):
File "/Users/Chris/Desktop/gdata_test.py", line 200, in <module>
main()
File "/Users/Chris/Desktop/gdata_test.py", line 196, in main
sample.Run()
File "/Users/Chris/Desktop/gdata_test.py", line 162, in Run
self._PromptForSpreadsheet()
File "/Users/Chris/Desktop/gdata_test.py", line 49, in _PromptForSpreadsheet
feed = self.gd_client.GetSpreadsheetsFeed()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/spreadsheet/service.py", line 99, in GetSpreadsheetsFeed
converter=gdata.spreadsheet.SpreadsheetsSpreadsheetsFeedFromString)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/service.py", line 1074, in Get
return converter(result_body)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/spreadsheet/__init__.py", line 395, in SpreadsheetsSpreadsheetsFeedFromString
xml_string)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/atom/__init__.py", line 93, in optional_warn_function
return f(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/atom/__init__.py", line 127, in CreateClassFromXMLString
tree = ElementTree.fromstring(xml_string)
File "<string>", line 125, in XML
cElementTree.ParseError: no element found: line 1, column 0
[Finished in 0.3s with exit code 1]
[shell_cmd: python -u "/Users/Chris/Desktop/gdata_test.py"]
[dir: /Users/Chris/Desktop]
[path: /usr/bin:/bin:/usr/sbin:/sbin]
I should also mention that I've been using gspread to interact with Google Spreadsheets, but when I run the code below I get the internal worksheet_id string, whereas what I need is the numeric gid from the URL.
gc = gspread.authorize(credentials)
sh = gc.open_by_url('google_spreadsheet_url')
sh.get_id_fields()
>> {'spreadsheet_id': '1BgCEn-3Nor7UxOEPwD-qv8qXe7CaveJBrn9_Lcpo4W4','worksheet_id': 'oqitk0d'}
See the self.gd_client.ProgrammaticLogin() call - this is causing the major problem since it uses the "ClientLogin" authorization method which was first deprecated and later removed on April 20, 2015.
I would actually look into the more fresh and actively developed gspread module instead.
Here is a, somewhat insane, example demonstrating how to extract the actual "gid" value for a given spreadsheet and worksheet name. Note that you would first need to generate the JSON file with the OAuth credentials (I'm assuming you've already done that).
The code (added comments that would hopefully help to understand it):
import urlparse
import xml.etree.ElementTree as ET
import gspread
from oauth2client.service_account import ServiceAccountCredentials
SPREADSHEET_NAME = 'My Test Spreadsheet'
WORKSHEET_NAME = "Sheet2"
PATH_TO_JSON_KEYFILE = '/path/to/json/key/file.json'
NAMESPACES = {'ns0': 'http://www.w3.org/2005/Atom'}
SCOPES = ['https://spreadsheets.google.com/feeds']
# log in
credentials = ServiceAccountCredentials.from_json_keyfile_name(PATH_TO_JSON_KEYFILE, SCOPES)
gss_client = gspread.authorize(credentials)
# open spreadsheet
gss = gss_client.open(SPREADSHEET_NAME)
# extract the full feed url
root = gss._feed_entry
full_feed_url = next(elm.attrib["href"] for elm in root.findall("ns0:link", namespaces=NAMESPACES) if "full" in elm.attrib["href"])
# get the feed and extract the gid value for a given sheet name
response = gss_client.session.get(full_feed_url)
root = ET.fromstring(response.content)
sheet_entry = next(elm for elm in root.findall("ns0:entry", namespaces=NAMESPACES)
if elm.find("ns0:title", namespaces=NAMESPACES).text == WORKSHEET_NAME)
link = next(elm.attrib["href"] for elm in sheet_entry.findall("ns0:link", namespaces=NAMESPACES)
if "gid=" in elm.attrib["href"])
# extract "gid" from URL
gid = urlparse.parse_qs(urlparse.urlparse(link).query)["gid"][0]
print(gid)
It also looks like there is a way to convert the worksheet ID to a gid value, see:
How to convert Google spreadsheet's worksheet string id to integer index (GID)?
Jan 2017
You can use the new Google Sheets API v4. You could take a look at the pygsheets library, which uses API v4.
import pygsheets
#authorize the pygsheets
gc = pygsheets.authorize()
#open the spreadsheet
sh = gc.open('my new ssheet')
# get the worksheet and its id
print sh.worksheet_by_title("my test sheet").id
This seems to work for me using gspread.
Given a worksheet named 'mysheet1' in a spreadsheet whose URL looks like this:
https://docs.google.com/spreadsheets/d/xxxxxf435454xxkjkjk23232325/edit#gid=645031900
the following can be used to retrieve the gid value (aka worksheet id or sheet id):
ss_key = xxxxxf435454xxkjkjk23232325
wks_name = mysheet1
gc.open_by_key('xxxxxf435454xxkjkjk23232325').worksheet('mysheet1').id
result:
645031900
I have a CSV with keywords in one column and the number of impressions in a second column.
I'd like to provide the keywords in a URL (while looping) and have the Google language API return what language each keyword is in.
I have it working manually. If I enter (with the correct api key):
http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=merde
I get:
{"responseData": {"language":"fr","isReliable":false,"confidence":6.213709E-4}, "responseDetails": null, "responseStatus": 200}
which is correct, 'merde' is French.
so far I have this code but I keep getting server unreachable errors:
import time
import csv
from operator import itemgetter
import sys
import fileinput
import urllib2
import json
E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2
#not working
def parse_result(result):
    """Parse a JSONP result string and return a list of terms"""
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    #Get the rows in the table, then get the second column's value
    # for each row
    return row in result_object

#not working
def retrieve_terms(seedterm):
    print(seedterm)
    """Retrieves and parses data and returns a list of terms"""
    url_template = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=%(seed)s'
    url = url_template % {"seed": seedterm}
    try:
        with urllib2.urlopen(url) as data:
            data = perform_request(seedterm)
            result = data.read()
    except:
        sys.stderr.write('%s\n' % 'Could not request data from server')
        exit(E_OPERATION_ERROR)
    #terms = parse_result(result)
    #print terms
    print result

def main(argv):
    filename = argv[1]
    csvfile = open(filename, 'r')
    csvreader = csv.DictReader(csvfile)
    rows = []
    for row in csvreader:
        rows.append(row)
    sortedrows = sorted(rows, key=itemgetter('impressions'), reverse=True)
    keys = sortedrows[0].keys()
    for item in sortedrows:
        retrieve_terms(item['keywords'])
    try:
        outputfile = open('Output_%s.csv' % (filename), 'w')
    except IOError:
        print("The file is active in another program - close it first!")
        sys.exit()
    dict_writer = csv.DictWriter(outputfile, keys, lineterminator='\n')
    dict_writer.writer.writerow(keys)
    dict_writer.writerows(sortedrows)
    outputfile.close()
    print("File is Done!! Check your folder")

if __name__ == '__main__':
    start_time = time.clock()
    main(sys.argv)
    print("\n")
    print time.clock() - start_time, "seconds for script time"
Any idea how to finish the code so that it will work? Thank you!
Try to add referrer, userip as described in the docs:
An area to pay special attention to relates to correctly identifying yourself in your requests. Applications MUST always include a valid and accurate http referer header in their requests. In addition, we ask, but do not require, that each request contains a valid API Key. By providing a key, your application provides us with a secondary identification mechanism that is useful should we need to contact you in order to correct any problems. Read more about the usefulness of having an API key.
Developers are also encouraged to make use of the userip parameter (see below) to supply the IP address of the end-user on whose behalf you are making the API request. Doing so will help distinguish this legitimate server-side traffic from traffic which doesn't come from an end-user.
Here's an example based on the answer to the question "access to google with python":
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib, urllib2
from pprint import pprint
api_key, userip = None, None
query = {'q' : 'матрёшка'}
referrer = "https://stackoverflow.com/q/4309599/4279"
if userip:
    query.update(userip=userip)
if api_key:
    query.update(key=api_key)

url = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&%s' % (
    urllib.urlencode(query))
request = urllib2.Request(url, headers=dict(Referer=referrer))
json_data = json.load(urllib2.urlopen(request))
pprint(json_data['responseData'])
Output
{u'confidence': 0.070496580000000003, u'isReliable': False, u'language': u'ru'}
Another issue might be that seedterm is not properly quoted:
if isinstance(seedterm, unicode):
    value = seedterm
else:  # bytes
    value = seedterm.decode(put_encoding_here)

url = 'http://...q=%s' % urllib.quote_plus(value.encode('utf-8'))