I use the Reddit API wrappers praw and psraw to extract comments from a subreddit. However, after running a few loops today I got two errors:
A JSON decoding error (the JSON object could not be decoded, or was empty) raised as a ValueError; even though I catch that exception in my code, it still doesn't work.
An HTTP request error,
example:
Traceback (most recent call last):
File "C:/Users/.../subreddit psraw.py", line 20, in <module>
for comment in submission.comments:
File "C:\Python27\lib\site-packages\praw\models\reddit\base.py", line 31, in __getattr__
self._fetch()
File "C:\Python27\lib\site-packages\praw\models\reddit\submission.py", line 142, in _fetch
'sort': self.comment_sort})
File "C:\Python27\lib\site-packages\praw\reddit.py", line 367, in get
data = self.request('GET', path, params=params)
File "C:\Python27\lib\site-packages\praw\reddit.py", line 451, in request
params=params)
File "C:\Python27\lib\site-packages\prawcore\sessions.py", line 174, in request
params=params, url=url)
File "C:\Python27\lib\site-packages\prawcore\sessions.py", line 108, in _request_with_retries
data, files, json, method, params, retries, url)
File "C:\Python27\lib\site-packages\prawcore\sessions.py", line 93, in _make_request
params=params)
File "C:\Python27\lib\site-packages\prawcore\rate_limit.py", line 33, in call
response = request_function(*args, **kwargs)
File "C:\Python27\lib\site-packages\prawcore\requestor.py", line 49, in request
raise RequestException(exc, args, kwargs)
prawcore.exceptions.RequestException: error with request
HTTPSConnectionPool(host='oauth.reddit.com', port=443): Read timed out. (read timeout=16.0)
Since a subreddit contains 10k+ comments, is there a way to solve this issue? Is it because the Reddit website is having problems today?
My code:
import praw, datetime, os, psraw
# Dump every comment of every submission found by psraw into a text file named
# after the submission id, one '"id", "body", "date"' record per line.
reddit = praw.Reddit('bot1')
subreddit = reddit.subreddit('example')
for submission in psraw.submission_search(reddit, subreddit='example', limit=1000000):
    try:
        # Accessing submission.comments is what triggers the network fetch of
        # the submission (see the _fetch frame in the traceback).
        for comment in submission.comments:
            subid = submission.id
            comid = comment.id
            # Written for Python 2: encode() returns a byte str there, so the
            # str-based replace() works; newlines are flattened so each record
            # stays on one line.
            com_body = comment.body.encode('utf-8').replace("\n", " ")
            com_date = datetime.datetime.utcfromtimestamp(comment.created_utc)
            string_com = '"{0}", "{1}", "{2}"\n'
            formatted_string_com = string_com.format(comid, com_body, com_date)
            # The context manager closes the handle after each write; the
            # original opened a new handle per comment and never closed any.
            with open('path' + subid + '.txt', 'a+') as indexFile_comment:
                indexFile_comment.write(formatted_string_com)
    except (ValueError, AttributeError):
        # Best effort: report the failure and move on to the next submission.
        # NOTE(review): the request timeout shown in the traceback is raised as
        # prawcore.exceptions.RequestException, which this clause does not catch.
        print ("error")
Related
OK, so I'm using Python 3.
I was able to get the data from the API using print(endpoint.json()),
but I want to make it readable with pandas so I can iterate through it more easily.
This is the code (keep in mind I removed my own API key, and I'm using RapidAPI as a resource, specifically the movie database):
import requests
import json
import pandas
# Query the RapidAPI movie endpoint and try to load the JSON reply into a
# pandas DataFrame.
url = "https://movies-tvshows-data-imdb.p.rapidapi.com/"
querystring = {"type":"get-popular-movies","page":"1","year":"2020"}
headers = {
'x-rapidapi-host': "movies-tvshows-data-imdb.p.rapidapi.com",
# 'my key' stands in for the redacted API key; it is a placeholder, not valid Python.
'x-rapidapi-key': my key
}
response = requests.request("GET", url, headers=headers, params=querystring)
# response.json() returns the already-decoded body; per the error message this is a dict here.
data=response.json()
# This is the call that raises the ValueError: read_json is handed the dict itself
# rather than a file path or buffer.
df=pandas.read_json(data)
print(df)
I get this error:
Traceback (most recent call last):
File "c:\Users\Home\Documents\studying\newproject\newproject.py", line 15, in <module>
df=pandas.read_json(data)
File "C:\Users\Home\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\util\_decorators.py", line 199, in wrapper
return func(*args, **kwargs)
File "C:\Users\Home\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\util\_decorators.py", line 296, in wrapper
return func(*args, **kwargs)
File "C:\Users\Home\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\json\_json.py", line 593, in read_json
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
File "C:\Users\Home\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\io\common.py", line 243, in get_filepath_or_buffer
raise ValueError(msg)
ValueError: Invalid file path or buffer object type: <class 'dict'>
In your case data is a dict.
So, try with:
pandas.DataFrame.from_dict(data)
I have a config.ini file that looks like this
[REDDIT]
client_id = 'myclientid23jd934g'
client_secret = 'myclientsecretjf30gj5g'
password = 'mypassword'
user_agent = 'myuseragent'
username = 'myusername'
When I try to use reddit's API praw like this:
import configparser
import praw
class redditImageScraper:
    """Print the ids of the hot submissions of a single subreddit.

    The Reddit credentials are read from the ``[REDDIT]`` section of
    ``config.ini`` in the current working directory.
    """

    def __init__(self, sub, limit):
        """Store the target subreddit and limit, and build the PRAW client.

        Args:
            sub: Name of the subreddit to read.
            limit: Maximum number of hot submissions to request.
        """
        self.sub = sub
        self.limit = limit

        parser = configparser.ConfigParser()
        parser.read('config.ini')

        # Each option is passed to praw.Reddit under the same keyword name.
        option_names = ('client_id', 'client_secret', 'password',
                        'user_agent', 'username')
        credentials = {}
        for option in option_names:
            credentials[option] = parser.get('REDDIT', option)
        self.reddit = praw.Reddit(**credentials)

    def get_content(self):
        """Print the id of each hot submission, up to ``self.limit``."""
        hot_posts = self.reddit.subreddit(self.sub).hot(limit=self.limit)
        for post in hot_posts:
            print(post.id)
def main():
    """Print the ids of the 25 hottest submissions in r/aww."""
    redditImageScraper('aww', 25).get_content()


# Only run the scraper when this file is executed as a script.
if __name__ == '__main__':
    main()
I get this traceback
Traceback (most recent call last):
File "config.py", line 30, in <module>
main()
File "config.py", line 27, in main
scraper.get_content()
File "config.py", line 22, in get_content
for submission in submissions:
File "C:\Users\Evan\Anaconda3\lib\site-packages\praw\models\listing\generator.py", line 61, in __next__
self._next_batch()
File "C:\Users\Evan\Anaconda3\lib\site-packages\praw\models\listing\generator.py", line 71, in _next_batch
self._listing = self._reddit.get(self.url, params=self.params)
File "C:\Users\Evan\Anaconda3\lib\site-packages\praw\reddit.py", line 454, in get
data = self.request("GET", path, params=params)
File "C:\Users\Evan\Anaconda3\lib\site-packages\praw\reddit.py", line 627, in request
method, path, data=data, files=files, params=params
File "C:\Users\Evan\Anaconda3\lib\site-packages\prawcore\sessions.py", line 185, in request
params=params, url=url)
File "C:\Users\Evan\Anaconda3\lib\site-packages\prawcore\sessions.py", line 116, in _request_with_retries
data, files, json, method, params, retries, url)
File "C:\Users\Evan\Anaconda3\lib\site-packages\prawcore\sessions.py", line 101, in _make_request
params=params)
File "C:\Users\Evan\Anaconda3\lib\site-packages\prawcore\rate_limit.py", line 35, in call
kwargs['headers'] = set_header_callback()
File "C:\Users\Evan\Anaconda3\lib\site-packages\prawcore\sessions.py", line 145, in _set_header_callback
self._authorizer.refresh()
File "C:\Users\Evan\Anaconda3\lib\site-packages\prawcore\auth.py", line 328, in refresh
password=self._password)
File "C:\Users\Evan\Anaconda3\lib\site-packages\prawcore\auth.py", line 138, in _request_token
response = self._authenticator._post(url, **data)
File "C:\Users\Evan\Anaconda3\lib\site-packages\prawcore\auth.py", line 31, in _post
raise ResponseException(response)
prawcore.exceptions.ResponseException: received 401 HTTP response
However when I manually insert the credentials, my code runs exactly as expected. Also, if I run the line
print(config.get('REDDIT', 'client_id'))
I get the output 'myclientid23jd934g' as expected.
Is there some reason that praw won't allow me to pass my credentials using configparser?
Double check what your inputs to praw.Reddit are:
# Collect the keyword arguments first so they can be inspected before they are
# handed to PRAW.
kwargs = dict(client_id=config.get('REDDIT','client_id'),
              client_secret=config.get('REDDIT','client_secret'),
              password=config.get('REDDIT','password'),
              user_agent=config.get('REDDIT','user_agent'),
              username=config.get('REDDIT','username'))
# Printing the dict shows exactly what each value contains, including any
# literal quote characters that were read from the ini file.
print(kwargs)
praw.Reddit(**kwargs)
You're overcomplicating configuration here — PRAW will take care of this for you.
If you rename config.ini to praw.ini, you can replace your whole initialization with just
self.reddit = praw.Reddit('REDDIT')
This is because PRAW will look for a praw.ini file and parse it for you. If you want to give the section a more descriptive name, make sure to update it in the praw.ini as well as in the single parameter passed to Reddit (which specifies the section of the file to use).
See https://praw.readthedocs.io/en/latest/getting_started/configuration/prawini.html.
As this page notes, values like username and password should not have quotation marks around them. For example,
password=mypassword
is correct, but
password="mypassword"
is incorrect.
Hi, I am trying to hit an API using Python's requests module. The API has to be hit about 20,000 times, as there are around 20,000 pages. Each response is around 10 MB, and by the end of the process a JSON file of around 100 GB is created. Here is the code I have written:
# Page through the API and append every item of every page to file.json as one
# JSON document per line.
# 1048576 bytes is 1 MiB, so this requests a 100 MiB write buffer (the original
# literal 1048567 had two digits transposed).
with open('file.json', 'wb', buffering=100 * 1048576) as f:
    while next_page_cursor != "":
        # The module is `requests` (as in the traceback), not `request`.
        # NOTE(review): `url` is not updated from next_page_cursor in the code
        # shown here; confirm the cursor is applied to the request elsewhere.
        with requests.get(url, headers=headers) as response:
            json_response = json.loads(response.content.decode('utf-8'))
            # The JSON response looks something like this:
            # {
            #     content: [{}, {}, {} ........ 50 dictionaries]
            #     next_page_cursor: "abcd"
            # }
            next_page_cursor = json_response['next_page_cursor']
            for data in json_response['content']:
                f.write((json.dumps(data) + "\n").encode())
But after running successfully for few pages the code fails giving the below error:
Traceback (most recent call last):
File "<command-1206920060120926>", line 65, in <module>
with requests.get(data_url, headers = headers) as response:
File "/databricks/python/lib/python3.7/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/databricks/python/lib/python3.7/site-packages/requests/sessions.py", line 686, in send
r.content
File "/databricks/python/lib/python3.7/site-packages/requests/models.py", line 828, in content
self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
File "/databricks/python/lib/python3.7/site-packages/requests/models.py", line 753, in generate
raise ChunkedEncodingError(e)
requests.exceptions.ChunkedEncodingError: ('Connection broken: OSError("(104, \'ECONNRESET\')")', OSError("(104, 'ECONNRESET')"))
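You need to use response.iter_content (with stream=True passed to requests.get, so that the body is read in chunks instead of all at once):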
https://2.python-requests.org/en/master/api/#requests.Response.iter_content
So I'm trying to program a Reddit reply bot to simplify moderating. I got pretty far into it, but when testing the code, Python gave me a long error that I don't understand. I haven't tried fixing it much because my Python skills are very limited, so I have no idea what to do.
import praw
# Reply with a canned phrase to each of the newest posts in a subreddit whose
# title contains one of the keywords.
# NOTE(review): the client id and secret are hard-coded here; they are better
# kept out of the source (for example in a praw.ini file).
userAgent = 'Recomend Bot 0.1'
cID = 'rz8Gh2k8RS-NRA'
cSC= '9FR8Balfkd0OcgiKVosMSqAP2YM'
userN = ''
userP =''
numFound = 0
reddit = praw.Reddit(user_agent=userAgent, client_id=cID, client_secret=cSC, username=userN, password=userP)
subreddit = reddit.subreddit('empfehlen_testen')
bot_phrase = 'Test Reply 177013'
keywords = {'test', 'Test', 'recomendation'}
for submission in subreddit.new(limit=10):
    n_title = submission.title.lower()
    # The title is lower-cased, so the keywords are lower-cased as well
    # ('Test' could never match otherwise). any() makes the bot reply at most
    # once per submission even when several keywords occur in the title.
    if any(keyword.lower() in n_title for keyword in keywords):
        numFound = numFound + 1
        print('Bot replying to: ')
        print("Title: ", submission.title)
        print("Text: ", submission.selftext)
        print("Score: ", submission.score)
        print("---------------------------------")
        print('Bot saying: ', bot_phrase)
        print()
        submission.reply(bot_phrase)
if numFound == 0:
    print()
    print("Sorry, didn't find any posts with those keywords, try again!")
#credit for code goes to Phrynk for code all I did was get it to work on my coumputer
I somehow got that working, and then this error message popped up:
Traceback (most recent call last):
File "C:\Users\Dillon\Desktop\RedditBot\reddit_bot.py", line 26, in <module>
for submission in subreddit.new(limit=10): #this views the top 10 posts in that subbreddit
File "C:\Users\Dillon\Desktop\RedditBot\praw\models\listing\generator.py", line 52, in __next__
self._next_batch()
File "C:\Users\Dillon\Desktop\RedditBot\praw\models\listing\generator.py", line 62, in _next_batch
self._listing = self._reddit.get(self.url, params=self.params)
File "C:\Users\Dillon\Desktop\RedditBot\praw\reddit.py", line 446, in get
data = self.request("GET", path, params=params)
File "C:\Users\Dillon\Desktop\RedditBot\praw\reddit.py", line 581, in request
method, path, data=data, files=files, params=params
File "C:\Users\Dillon\AppData\Local\Programs\Python\Python37-32\lib\site-packages\prawcore\sessions.py", line 185, in request
params=params, url=url)
File "C:\Users\Dillon\AppData\Local\Programs\Python\Python37-32\lib\site-packages\prawcore\sessions.py", line 116, in _request_with_retries
data, files, json, method, params, retries, url)
File "C:\Users\Dillon\AppData\Local\Programs\Python\Python37-32\lib\site-packages\prawcore\sessions.py", line 101, in _make_request
params=params)
File "C:\Users\Dillon\AppData\Local\Programs\Python\Python37-32\lib\site-packages\prawcore\rate_limit.py", line 35, in call
kwargs['headers'] = set_header_callback()
File "C:\Users\Dillon\AppData\Local\Programs\Python\Python37-32\lib\site-packages\prawcore\sessions.py", line 145, in _set_header_callback
self._authorizer.refresh()
File "C:\Users\Dillon\AppData\Local\Programs\Python\Python37-32\lib\site-packages\prawcore\auth.py", line 328, in refresh
password=self._password)
File "C:\Users\Dillon\AppData\Local\Programs\Python\Python37-32\lib\site-packages\prawcore\auth.py", line 142, in _request_token
payload.get('error_description'))
prawcore.exceptions.OAuthException: invalid_grant error processing request
prawcore.exceptions.OAuthException: invalid_grant error processing request
means there was a problem authenticating the user.
Remember that the username is your Reddit account's name, not the bot's name.
macOS 10.12.3 python 2.7.13 requests 2.13.0
I use the requests package to send POST requests. The request requires being logged in before posting data, so I use requests.Session() and load a cookie from a logged-in session.
Then I use this session to send POST data in a loop.
I have run this code on Windows and Linux without any errors.
Simple Code:
# Reuse one session, authenticated through a previously saved cookie file, for
# every POST. The module is `requests` (as in the traceback), not `request`.
s = requests.Session()
# NOTE(review): Python 2's standard-library module is named `cookielib`;
# confirm that `cookieslib` is an alias for it in the real file.
s.cookies = cookieslib.LWPCookieJar('cookise')
# ignore_discard=True also loads cookies that are marked to be discarded.
s.cookies.load(ignore_discard=True)
for user_id in range(100,200):
    url = 'http://xxxx'
    data = { 'user': user_id, 'content': '123'}
    r = s.post(url, data)
...
But the program frequently crashes (at roughly regular intervals); the error is AttributeError: 'module' object has no attribute 'kqueue'
Traceback (most recent call last):
File "/Users/gasxia/Dev/Projects/TgbookSpider/kfz_send_msg.py", line 90, in send_msg
r = requests.post(url, data) # catch error if user isn't exist
File "/usr/local/lib/python2.7/site-packages/requests/sessions.py", line 535, in post
return self.request('POST', url, data=data, json=json, **kwargs)
File "/usr/local/lib/python2.7/site-packages/requests/sessions.py", line 488, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python2.7/site-packages/requests/sessions.py", line 609, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python2.7/site-packages/requests/adapters.py", line 423, in send
timeout=timeout
File "/usr/local/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.py", line 588, in urlopen
conn = self._get_conn(timeout=pool_timeout)
File "/usr/local/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.py", line 241, in _get_conn
if conn and is_connection_dropped(conn):
File "/usr/local/lib/python2.7/site-packages/requests/packages/urllib3/util/connection.py", line 27, in is_connection_dropped
return bool(wait_for_read(sock, timeout=0.0))
File "/usr/local/lib/python2.7/site-packages/requests/packages/urllib3/util/wait.py", line 33, in wait_for_read
return _wait_for_io_events(socks, EVENT_READ, timeout)
File "/usr/local/lib/python2.7/site-packages/requests/packages/urllib3/util/wait.py", line 22, in _wait_for_io_events
with DefaultSelector() as selector:
File "/usr/local/lib/python2.7/site-packages/requests/packages/urllib3/util/selectors.py", line 431, in __init__
self._kqueue = select.kqueue()
AttributeError: 'module' object has no attribute 'kqueue'
This looks like a problem that commonly arises if you're using something like eventlet or gevent, both of which monkeypatch the select module. If you're using those to achieve asynchrony, you will need to ensure that those monkeypatches are applied before importing requests. This is a known bug, being tracked in this issue.