Scrape Facebook AttributeError - Python

I am a beginner with Python. How can I solve
AttributeError: module 'urllib' has no attribute 'Request'
I have looked at other posts but still can't understand how to solve the problem.
Here is a screen capture of the error (the traceback points at url = urllib.Request(url)).
This is the code (adapted from https://github.com/minimaxir/facebook-page-post-scraper/blob/master/get_fb_posts_fb_page.py):
import urllib.request
import json, datetime, csv, time

app_id = "xxx"
app_secret = "xxx"  # DO NOT SHARE WITH ANYONE!
access_token = "xxx"
page_id = 'xxx'

def testFacebookPageData(page_id, access_token):
    # construct the URL string
    base = "https://graph.facebook.com/v2.4"
    node = "/" + page_id + '/feed'
    parameters = "/?access_token=%s" % access_token
    url = base + node + parameters

    # retrieve data
    response = urllib.request.urlopen(url)
    data = json.loads(response.read().decode('utf-8'))
    print(data)

def request_until_succeed(url):
    req = urllib.request.urlopen(url)
    success = False
    while success is False:
        try:
            response = urllib.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print(e)
            time.sleep(5)
            print(url, datetime.datetime.now())
    return response.read()

def getFacebookPageFeedData(page_id, access_token, num_statuses):
    # construct the URL string
    base = "https://graph.facebook.com"
    node = "/" + page_id + "/feed"
    parameters = "/?fields=message,link,created_time,type,name,id,likes.limit(1).summary(true),comments.limit(1).summary(true),shares&limit=%s&access_token=%s" % (num_statuses, access_token)  # changed
    url = base + node + parameters

    # retrieve data
    data = json.loads(request_until_succeed(url))
    return data

def processFacebookPageFeedStatus(status):
    # The status is now a Python dictionary, so for top-level items,
    # we can simply call the key.
    # Additionally, some items may not always exist,
    # so must check for existence first
    status_id = status['id']
    status_message = '' if 'message' not in status.keys() else status['message'].encode('utf-8')
    link_name = '' if 'name' not in status.keys() else status['name'].encode('utf-8')
    status_type = status['type']
    status_link = '' if 'link' not in status.keys() else status['link']

    # Time needs special care since a) it's in UTC and
    # b) it's not easy to use in statistical programs.
    status_published = datetime.datetime.strptime(status['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    status_published = status_published + datetime.timedelta(hours=-5)  # EST
    status_published = status_published.strftime('%Y-%m-%d %H:%M:%S')  # best time format for spreadsheet programs

    # Nested items require chaining dictionary keys.
    num_likes = 0 if 'likes' not in status.keys() else status['likes']['summary']['total_count']
    num_comments = 0 if 'comments' not in status.keys() else status['comments']['summary']['total_count']
    num_shares = 0 if 'shares' not in status.keys() else status['shares']['count']

    # return a tuple of all processed data
    return (status_id, status_message, link_name, status_type, status_link,
            status_published, num_likes, num_comments, num_shares)

def scrapeFacebookPageFeedStatus(page_id, access_token):
    with open('%s_facebook_statuses.csv' % page_id, 'w') as file:
        w = csv.writer(file)
        w.writerow(["status_id", "status_message", "link_name", "status_type", "status_link",
                    "status_published", "num_likes", "num_comments", "num_shares"])

        has_next_page = True
        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print(page_id, scrape_starttime)

        statuses = getFacebookPageFeedData(page_id, access_token, 100)

        while has_next_page:
            for status in statuses['data']:
                w.writerow(processFacebookPageFeedStatus(status))

                # output progress occasionally to make sure code is not stalling
                num_processed += 1
                if num_processed % 1000 == 0:
                    print(num_processed, datetime.datetime.now())

            # if there is no next page, we're done.
            if 'paging' in statuses.keys():
                statuses = json.loads(request_until_succeed(statuses['paging']['next']))
            else:
                has_next_page = False

        print(num_processed, datetime.datetime.now() - scrape_starttime)

if __name__ == '__main__':
    scrapeFacebookPageFeedStatus(page_id, access_token)

There is no urllib.Request() in Python 3 - there is urllib.request.Request().
EDIT: the error message shows url = urllib.Request(url), but I don't see that line in your code - maybe you are running the wrong file.
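For reference, here is a minimal sketch of request_until_succeed using the Python 3 names (urllib.request.Request and urllib.request.urlopen); the retry logic mirrors the code in the question, only the attribute paths change:

import datetime
import time
import urllib.request

def request_until_succeed(url):
    # build the request with the Python 3 class, then open it with urllib.request.urlopen
    req = urllib.request.Request(url)
    success = False
    while success is False:
        try:
            response = urllib.request.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print(e)
            time.sleep(5)
            print(url, datetime.datetime.now())
    return response.read()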

Related

I'm getting a "ListError: list index out of range" in my "clean_json_response" function

I'm using the Medium API to get some information, but after a number of API calls the Python script ends with this error:
IndexError: list index out of range
Here is my Python code:
def get_post_responses(posts):
    #start = time.time()
    count = 0
    print('Retrieving the post responses...')
    responses = []
    for post in posts:
        url = MEDIUM + '/_/api/posts/' + post + '/responses'
        count = count + 1
        print("number of times api called", count)
        response = requests.get(url)
        response_dict = clean_json_response(response)
        responses += response_dict['payload']['value']
    #end = time.time()
    #four = end - start
    #global time_cal
    #time_cal.append(four)
    return responses

def check_if_high_recommends(response, recommend_min):
    if response['virtuals']['recommends'] >= recommend_min:
        return True

def check_if_recent(response):
    limit_date = datetime.now() - timedelta(days=360)
    creation_epoch_time = response['createdAt'] / 1000
    creation_date = datetime.fromtimestamp(creation_epoch_time)
    if creation_date >= limit_date:
        return True
It needs to work for more than 10,000 followers for a user.
I found an answer to my question: I just needed to wrap the call in a try/except block.

response_dict = clean_json_response(response)
try:
    responses += response_dict['payload']['value']
except:
    continue  # skip this response if the payload is missing
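Applied inside get_post_responses, the guarded loop would look roughly like this (a sketch; it assumes clean_json_response raises an IndexError or KeyError when the expected payload is missing, and MEDIUM and clean_json_response come from the question's own code):

import requests

def get_post_responses(posts):
    # collect responses for each post, skipping posts whose payload is missing
    responses = []
    for post in posts:
        url = MEDIUM + '/_/api/posts/' + post + '/responses'
        response = requests.get(url)
        try:
            response_dict = clean_json_response(response)
            responses += response_dict['payload']['value']
        except (IndexError, KeyError):
            continue  # skip this post if the cleaned JSON has no payload
    return responses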

Looping through Rocket.Chat API

Python 3.7.2
PyCharm
I'm fairly new to Python and API interaction; I'm trying to loop through the Rocket.Chat API, specifically pulling user email addresses out.
Unlike nearly every example I can find, Rocket.Chat doesn't use any kind of "next" construct - it uses count and offset, which I had actually thought might make this easier.
I have managed to get the first part of this working: looping over the JSON and getting the emails. What I need to do is loop through the API endpoint, which is where I have run into some issues.
I have looked at this answer, Unable to loop through paged API responses with Python, as it seemed pretty close to what I want, but I couldn't get it to work correctly.
The code below is what I have right now; obviously it isn't doing any looping through the API endpoint just yet, it's just looping over the returned JSON.
import os
import csv
import requests
import json

url = "https://rocketchat.internal.net"
login = "/api/v1/login"
rocketchatusers = "/api/v1/users.list"
#offset = "?count=500&offset=0"

class API:
    def userlist(self, userid, token):
        headers = {'X-Auth-Token': token, 'X-User-Id': userid}
        rocketusers = requests.get(url + rocketchatusers, headers=headers, verify=False)
        print('Status Code:' + str(rocketusers.status_code))
        print('Content Type:' + rocketusers.headers['content-type'])
        userlist = json.loads(rocketusers.text)
        x = 0
        y = 0
        emails = open('emails', 'w')
        while y == 0:
            try:
                for i in userlist:
                    print(userlist['users'][x]['emails'][0]['address'], file=emails)
                    # print(userlist['users'][x]['emails'][0]['address'])
                    x += 1
            except KeyError:
                print("This user has no email address", file=emails)
                x += 1
            except IndexError:
                print("End of List")
                emails.close()
                y += 1
What I have tried, and what I would like to do, is something along the lines of a simple for loop. There are probably a lot of ways to do what I'm after; I just don't know them.
Something like this:
import os
import csv
import requests
import json

url = "https://rocketchat.internal.net"
login = "/api/v1/login"
rocketchatusers = "/api/v1/users.list"
offset = "?count=500&offset=" + p
p = 0

class API:
    def userlist(self, userid, token):
        headers = {'X-Auth-Token': token, 'X-User-Id': userid}
        rocketusers = requests.get(url + rocketchatusers + offset, headers=headers, verify=False)
        for r in rocketusers:
            print('Status Code:' + str(rocketusers.status_code))
            print('Content Type:' + rocketusers.headers['content-type'])
            userlist = json.loads(rocketusers.text)
            x = 0
            y = 0
            emails = open('emails', 'w')
            while y == 0:
                try:
                    for i in userlist:
                        print(userlist['users'][x]['emails'][0]['address'], file=emails)
                        # print(userlist['users'][x]['emails'][0]['address'])
                        x += 1
                except KeyError:
                    print("This user has no email address", file=emails)
                    x += 1
                except IndexError:
                    print("End of List")
                    emails.close()
                    y += 1
            p += 500
Now, obviously this doesn't work, or I wouldn't be posting, but why it doesn't work is the issue.
The error reported is that an int can't be concatenated where a str is expected. OK, fine. When I attempt something like:
str(p = 0)
I get a TypeError. I have tried a lot of other things as well, many of them simply silly, such as p = [] and p = {}, and other more radical ideas as well.
The URL, if not all variables and concatenated would look something like this:
https://rocketchat.internal.net/api/v1/users.list?count=500&offset=0
https://rocketchat.internal.net/api/v1/users.list?count=500&offset=500
https://rocketchat.internal.net/api/v1/users.list?count=500&offset=1000
https://rocketchat.internal.net/api/v1/users.list?count=500&offset=1500
I feel like there is something really simple that I'm missing. I'm reasonably sure that the answer is in the response to the post I listed, but I couldn't get it to work.
So, after asking around, I found out that I had been on the right path to figuring this out; I had just tried it in the wrong place. Here's what I ended up with:
def userlist(self, userid, token):
    p = 0
    while p <= 7500:
        if not os.path.exists('./emails'):
            headers = {'X-Auth-Token': token, 'X-User-Id': userid}
            rocketusers = requests.get(url + rocketchatusers + offset + str(p), headers=headers, verify=False)
            print('Status Code:' + str(rocketusers.status_code))
            print('Content Type:' + rocketusers.headers['content-type'])
            print('Creating the file "emails" to use to compare against list of regulated users.')
            print(url + rocketchatusers + offset + str(p))
            userlist = json.loads(rocketusers.text)
            x = 0
            y = 0
            emails = open('emails', 'a+')
            while y == 0:
                try:
                    for i in userlist:
                        #print(userlist['users'][x]['emails'][0]['address'], file=emails)
                        print(userlist['users'][x]['ldap'], file=emails)
                        print(userlist['users'][x]['username'], file=emails)
                        x += 1
                except KeyError:
                    x += 1
                except IndexError:
                    print("End of List")
                    emails.close()
                    p += 50
                    y += 1
        else:
            headers = {'X-Auth-Token': token, 'X-User-Id': userid}
            rocketusers = requests.get(url + rocketchatusers + offset + str(p), headers=headers, verify=False)
            print('Status Code:' + str(rocketusers.status_code))
            print('Content Type:' + rocketusers.headers['content-type'])
            print('Populating file "emails" - this takes a few moments, please be patient.')
            print(url + rocketchatusers + offset + str(p))
            userlist = json.loads(rocketusers.text)
            x = 0
            z = 0
            emails = open('emails', 'a+')
            while z == 0:
                try:
                    for i in userlist:
                        #print(userlist['users'][x]['emails'][0]['address'], file=emails)
                        print(userlist['users'][x]['ldap'], file=emails)
                        print(userlist['users'][x]['username'], file=emails)
                        x += 1
                except KeyError:
                    x += 1
                except IndexError:
                    print("End of List")
                    emails.close()
                    p += 50
                    z += 1
This is still a work in progress. Unfortunately, this isn't an avenue for collaboration; I may post it to GitHub later so that others can see it.
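For comparison, here is a minimal sketch of the same pagination using the params argument of requests.get, which keeps the offset as an integer so no string concatenation is needed. It reuses the URL and headers from the code above, and it assumes the users.list response includes a total field to stop on:

import requests

def fetch_all_users(base_url, userid, token, page_size=500):
    # page through /api/v1/users.list with count/offset query parameters
    headers = {'X-Auth-Token': token, 'X-User-Id': userid}
    users = []
    offset = 0
    while True:
        params = {'count': page_size, 'offset': offset}  # ints are fine; requests builds the query string
        resp = requests.get(base_url + '/api/v1/users.list', headers=headers, params=params, verify=False)
        data = resp.json()
        batch = data.get('users', [])
        users.extend(batch)
        offset += page_size
        # stop on an empty page or once we pass the reported total (assumed field)
        if not batch or offset >= data.get('total', 0):
            break
    return users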

Export users with criteria - Python script

The script I have exports all users, but I want to export only users with type = xyz. There are two types of users in the directory, type A and type B, and I only want to export users whose type attribute matches B.
Please help me add a clause/statement to the script so it only pulls users with type "B" and ignores users with any other type.
import requests
import json
import re
import sys
import csv

orgName = ""
apiKey = ""

api_token = "SSWS " + apiKey
headers = {'Accept': 'application/json', 'Content-Type': 'application/json', 'Authorization': api_token}

def GetPaginatedResponse(url):
    response = requests.request("GET", url, headers=headers)
    returnResponseList = []
    responseJSON = json.dumps(response.json())
    responseList = json.loads(responseJSON)
    returnResponseList = returnResponseList + responseList
    if "errorCode" in responseJSON:
        print "\nYou encountered following Error: \n"
        print responseJSON
        print "\n"
        return "Error"
    else:
        headerLink = response.headers["Link"]
        while str(headerLink).find("rel=\"next\"") > -1:
            linkItems = str(headerLink).split(",")
            nextCursorLink = ""
            for link in linkItems:
                if str(link).find("rel=\"next\"") > -1:
                    nextCursorLink = str(link)
            nextLink = str(nextCursorLink.split(";")[0]).strip()
            nextLink = nextLink[1:]
            nextLink = nextLink[:-1]
            url = nextLink
            response = requests.request("GET", url, headers=headers)
            responseJSON = json.dumps(response.json())
            responseList = json.loads(responseJSON)
            returnResponseList = returnResponseList + responseList
            headerLink = response.headers["Link"]
        returnJSON = json.dumps(returnResponseList)
        return returnResponseList

def DownloadSFUsers():
    url = "https://" + orgName + ".com/api/v1/users"
    responseJSON = GetPaginatedResponse(url)
    if responseJSON != "Error":
        userFile = open("Only-Okta_Users.csv", "wb")
        writer = csv.writer(userFile)
        writer.writerow(["login", "type"])
        for user in responseJSON:
            login = user[u"profile"][u"login"]
            type = user[u"credentials"][u"provider"][u"type"]
            row = ("+login+"," + type).encode('utf-8')
            writer.writerow([login, type])

if __name__ == "__main__":
    DownloadSFUsers()
Wrap your statement that writes a user to the csv file in an if statement that tests for the correct type.
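For example, a minimal sketch of that if statement inside DownloadSFUsers; it assumes the value you want to keep is literally the string "B", so adjust the comparison to whatever your directory actually stores for that attribute:

for user in responseJSON:
    login = user[u"profile"][u"login"]
    userType = user[u"credentials"][u"provider"][u"type"]
    if userType == "B":  # only write type B users; all other types are skipped
        writer.writerow([login, userType])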

Python avoid item = None in a request of multiple items

I'm looping over multiple requests to a web database; each iteration takes one geneId and queries the database with it. If the geneId is referenced, I can use the returned data for another request against a second database. But if the geneId is not referenced, I get nothing back and it breaks my functions.
So I allowed for a None value, but in that case I get:
TypeError: 'NoneType' object is not iterable
Here is the part of my code that involves the None:
def getNewID(oneGeneIdAtOnce):
    url = "http:/blablabla"
    try:
        cookieJar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookieJar))
        opener.addheaders = getHeaders()
        resp = opener.open(url)
        data = urllib.parse.urlencode(getPost(oneGeneIdAtOnce))
        data = data.encode("UTF-8")
        resp = opener.open(url, data)
        respData = resp.read()
        jobValue = getJobValueFromCookie(cookieJar)
        downloadUrl = getUrlFromJobValue(jobValue)
        resp = opener.open(downloadUrl)
        respData = resp.read()
        string = respData.decode("UTF-8")
        if not string:
            return None
        l = string.split("\n")
        lFinal = l[1].split("\t")
        return lFinal[1]
    except HTTPError as httpError:
        print("HERE " + str(httpError))
    except TypeError:
        None

def searchGoFromDico(dictionary):
    dicoGoForEachGroup = {}
    for groupName in dico:
        taxonAndGene = dico[groupName]
        listeAllGoForOneGroup = []
        for taxon in taxonAndGene:
            geneIds = taxonAndGene[taxon]
            for geneId in geneIds:
                if geneId is not None:
                    listeGo = getGoID(getUniprotID(geneId))
                    listeAllGoForOneGroup.extend(listeGo)
        dicoGoForEachGroup[groupName] = listeAllGoForOneGroup
    return dicoGoForEachGroup
Any idea how to make my functions work properly even if one of the geneIds is None in the database? Thank you for your upcoming answers.
You can use a try/except block around the code that's causing the issue. If geneIds is None, iterating over it raises a TypeError, and the except clause skips to the next taxon (untested):
def searchGoFromDico(dictionary):
    dicoGoForEachGroup = {}
    for groupName in dico:
        taxonAndGene = dico[groupName]
        listeAllGoForOneGroup = []
        for taxon in taxonAndGene:
            geneIds = taxonAndGene[taxon]
            try:
                for geneId in geneIds:
                    if geneId is not None:
                        listeGo = getGoID(getUniprotID(geneId))
                        listeAllGoForOneGroup.extend(listeGo)
            except TypeError:
                continue
        dicoGoForEachGroup[groupName] = listeAllGoForOneGroup
    return dicoGoForEachGroup
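As an alternative to catching the TypeError, a guard before the inner loop avoids the exception entirely. A minimal sketch, assuming each per-taxon value is either a list of gene IDs or None, and that getGoID may itself return None for unknown IDs:

def searchGoFromDico(dico):
    # skip None values up front instead of catching the TypeError
    dicoGoForEachGroup = {}
    for groupName in dico:
        taxonAndGene = dico[groupName]
        listeAllGoForOneGroup = []
        for taxon in taxonAndGene:
            geneIds = taxonAndGene[taxon] or []  # treat a None entry as an empty list
            for geneId in geneIds:
                if geneId is None:
                    continue
                listeGo = getGoID(getUniprotID(geneId))
                if listeGo:  # getGoID may also return None for unreferenced IDs
                    listeAllGoForOneGroup.extend(listeGo)
        dicoGoForEachGroup[groupName] = listeAllGoForOneGroup
    return dicoGoForEachGroup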

Youtube API Handling Deleted video error

I have written code to get the playlists of a channel and the video lists within them, written to separate text files:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
YouTube Playlist Extractor.
A tool to extract playlists from the YouTube API, which in today's YouTube page format is very difficult to extract.
It also extracts the video list per playlist and hence takes a bit longer to run for long playlists.
"""
#from profiler import Profiler
from xml.dom.minidom import parseString
import os

try:
    import urllib.request as urlLibReq
    PY3 = True
except:
    import urllib as urlLibReq
    PY3 = False

def getInput():
    if PY3:
        return input("Enter username of YouTube channel: ")
    elif not PY3:
        return raw_input("Enter username of YouTube channel: ")

def xmlParser(url):
    page = urlLibReq.urlopen(url)
    text = page.read().decode("utf8")
    return parseString(text)

def extractplaylist(userId):
    url = "https://gdata.youtube.com/feeds/api/users/" + userId + "/playlists?v=2"
    dom = xmlParser(url)
    total = int(dom.getElementsByTagName("openSearch:totalResults")[0].firstChild.nodeValue)
    startIndex, listEntry = 1, []
    while startIndex <= total:
        url_new = url + "&max-results=50&start-index=" + str(startIndex)
        dom = xmlParser(url_new)
        entry = dom.getElementsByTagName("entry")
        for node in entry:
            id_data = node.getElementsByTagName("id")[0].firstChild.nodeValue
            id_split = id_data.split(':')
            playlist_id = id_split[5]
            playlist_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
            extractvideolist(userId, playlist_id, playlist_title)
            listEntry.append(str(playlist_title))
            startIndex += 1
    listEntry.sort()
    writer = open(userId + "_playlist.txt", "w")
    writer.write("\r\n".join(map(str, listEntry)))
    writer.close()

def extractvideolist(userId, playlist_id, playlist_title):
    url = "http://gdata.youtube.com/feeds/api/playlists/" + playlist_id + "?v=2"
    dom = xmlParser(url)
    total = int(dom.getElementsByTagName("openSearch:totalResults")[0].firstChild.nodeValue)
    startIndex, listEntry = 1, []
    while startIndex <= total:
        url_new = url + "&max-results=50&start-index=" + str(startIndex)
        dom = xmlParser(url_new)
        entry = dom.getElementsByTagName("entry")
        for node in entry:
            video_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
            listEntry.append(str(video_title))
            startIndex += 1
    playlist_title = playlist_title.replace("'", "\'")
    writer = open(playlist_title + "_videolist.txt", "w")
    writer.write("\r\n".join(map(str, listEntry)))
    writer.close()
    print("written", playlist_title)
    try: os.mkdir(userId)
    except: pass
    os.system('mv "' + playlist_title + '_videolist.txt" ' + userId)

if __name__ == "__main__":
    name = getInput()
    extractplaylist(name)
    #Profiler.report()
The code fails when there is a deleted video in the playlist. How do I deal with such a thing?
Try adding an else clause to your for loop to break out of the while loop when the for loop ends.
while startIndex <= total:
    url_new = url + "&max-results=50&start-index=" + str(startIndex)
    dom = xmlParser(url_new)
    entry = dom.getElementsByTagName("entry")
    for node in entry:
        id_data = node.getElementsByTagName("id")[0].firstChild.nodeValue
        id_split = id_data.split(':')
        playlist_id = id_split[5]
        playlist_title = node.getElementsByTagName("title")[0].firstChild.nodeValue
        extractvideolist(userId, playlist_id, playlist_title)
        listEntry.append(str(playlist_title))
        startIndex += 1
    else:
        break
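If the crashes come from deleted videos appearing as entries without the usual child nodes, another option is to guard each entry inside extractvideolist and skip the broken ones. A sketch of the inner for loop, assuming a deleted video shows up as an entry whose title node is missing or empty:

for node in entry:
    startIndex += 1  # count every entry so the paging loop still advances
    try:
        title_nodes = node.getElementsByTagName("title")
        if not title_nodes or title_nodes[0].firstChild is None:
            continue  # no usable title, e.g. a deleted video; skip it
        listEntry.append(str(title_nodes[0].firstChild.nodeValue))
    except (IndexError, AttributeError):
        continue  # skip malformed entries instead of crashing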
