Have a look at the following code:
q = '#MentionSomeoneImportantForYou'
count = 100
search_results = twitter_api.search.tweets(q=q, count=count)
#twitter_api is predefined and is working fine.
statuses = search_results['statuses']
for _ in range(5):
    print "Length of statuses", len(statuses)
    try:
        next_results = search_results['search_metadata']['next_results']
    except KeyError, e: # No more results when next_results doesn't exist
        break
    kwargs = dict([ kv.split('=') for kv in next_results[1:].split("&") ])
The last line throws an error saying that 'next_results' is not defined.
Where did I go wrong here?
I really don't see why you need
next_results = search_results['search_metadata']['next_results']
when that line will return the same result on all 5 iterations.
Anyway, "next_results" not being defined means that the line above was never executed successfully, not even once.
How about
print search_results['search_metadata']
to see exactly what the API's response looks like?
This code works perfectly!
# Import unquote to prevent url encoding errors in next_results
from urllib.parse import unquote
import json  # used below to pretty-print a sample result
q = '#Ethiopia'
count = 100
# See https://dev.twitter.com/docs/api/1.1/get/search/tweets
search_results = twitter_api.search.tweets(q=q, count=count)
statuses = search_results['statuses']
# Iterate through 5 more batches of results by following the cursor
print(search_results['search_metadata'])
for _ in range(5):
    print("Length of statuses", len(statuses))
    try:
        #print(search_results['search_metadata'])
        next_results = search_results['search_metadata']['next_results']
    except KeyError: # No more results when next_results doesn't exist
        break
    # Create a dictionary from next_results, which has the following form:
    # ?max_id=313519052523986943&q=NCAA&include_entities=1
    kwargs = dict([ kv.split('=') for kv in unquote(next_results[1:]).split("&") ])
    search_results = twitter_api.search.tweets(**kwargs)
    statuses += search_results['statuses']
# Show one sample search result by slicing the list...
print(json.dumps(statuses[0], indent=1))
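For what it's worth, a slightly more defensive way to follow the cursor is to use dict.get() instead of catching KeyError. This is just a minimal sketch of the same pagination loop, assuming twitter_api, statuses and search_results are defined exactly as above; it doesn't change anything about the API calls themselves:
# Sketch: the same pagination loop, using dict.get() instead of try/except.
for _ in range(5):
    next_results = search_results['search_metadata'].get('next_results')
    if next_results is None:  # no more results to page through
        break
    kwargs = dict(kv.split('=') for kv in unquote(next_results[1:]).split('&'))
    search_results = twitter_api.search.tweets(**kwargs)
    statuses += search_results['statuses']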
I can't append separate values from JSON data to lists. When trying to index them, I get this kind of error: 'TypeError: 'int' object is not subscriptable'.
Without indexing, it just appends ALL of the data, which I don't want.
In this part I am getting the data:
import requests
import json

protein = []
fat = []
calories = []
sugar = []

def scrape_all_fruits():
    data_list = []
    try:
        for ID in range(1, 10):
            url = f'https://www.fruityvice.com/api/fruit/{ID}'
            response = requests.get(url)
            data = response.json()
            data_list.append(data)
    except:
        pass
    return data_list
In this part I am trying to append the data and I get the error I mentioned before.
alist = json.dumps(scrape_all_fruits())
jsonSTr = json.loads(alist)
for i in jsonSTr:
    try:
        for value in i['nutritions'].values():
            fat.append(value['fat'])
    except KeyError:
        pass
print(fat)
You iterate through the values of 'nutritions', so there cannot be a "fat" key on those values. And why iterate through them at all? There's no reason to; just take the key directly.
alist = json.dumps(scrape_all_fruits())
json_str = json.loads(alist)
for i in json_str:
    try:
        print(i['nutritions'])
        fat.append(i['nutritions']['fat'])
    except KeyError:
        pass
print(fat)
This works. Tested on Python 3.8
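If you also want to fill the other lists defined at the top of the question, the same pattern extends naturally. This is only a sketch, assuming each item's 'nutritions' dict really does contain protein, fat, calories and sugar keys (the list names in the question suggest it does):
# Sketch: collect every nutrient the question declares a list for.
# Missing keys are simply skipped rather than raising KeyError.
for i in json_str:
    nutritions = i.get('nutritions', {})
    for name, target in (('protein', protein), ('fat', fat),
                         ('calories', calories), ('sugar', sugar)):
        if name in nutritions:
            target.append(nutritions[name])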
I am attempting to use Selenium together with a worker pool and a list of items.
This is my code:
from selenium import webdriver
from multiprocessing import Pool  # not imported in the original snippet; multiprocessing.Pool assumed

pool_size = 1

def worker(item1):
    try:
        #driver = webdriver.Chrome()
        print("Processing:", item1)
        #driver.get(item1)
    except Exception as e:
        print('error | ' + str(e))

pool = Pool(pool_size)
items = open('file.txt', 'r').read().splitlines()
for item in items:
    print(item)
    pool.apply_async(worker, str(item))
pool.close()
pool.join()
If I change items = open('file.txt', 'r').read().splitlines() to items = ['1','2'] it works; otherwise it just returns nothing and the script ends. The same goes for the file if I put 1\n2 in it.
No idea why this is happening, super weird. Thanks in advance.
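One way to see what is actually going wrong: apply_async silently swallows exceptions raised inside the worker unless you keep the AsyncResult objects and call .get() on them, and its args argument is normally a tuple such as (item,) rather than a plain string. A minimal sketch of the dispatch part of the snippet above, with worker, pool and items defined as before:
# Sketch: keep the AsyncResult handles so worker exceptions become visible
# instead of being silently dropped by apply_async.
async_results = [pool.apply_async(worker, (item,)) for item in items]  # args as a tuple
pool.close()
for r in async_results:
    try:
        r.get(timeout=30)  # re-raises any exception the worker hit
    except Exception as e:
        print('worker failed |', e)
pool.join()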
I need help finding email addresses on websites. After some research I found a solution, but it is very slow; I have a lot of data (more than 90,000 URLs) and my code never finishes.
Do you know any tips to optimize/accelerate my code?
This is my list of URLs:
http://etsgaidonsarl.site-solocal.com/
http://fr-fr.facebook.com/people/
http://ipm-mondia.com/
http://lfgenieclimatique.fr/
http://vpcinstallation.site-solocal.com
http://www.cavifroid.fr/
http://www.clim-monnier.com/
http://www.climacool.net/
I use 2 loops. The first finds all the pages of a website, because the email address is not always on the first page.
In the second loop, I crawl each page to find the email address. The code:
EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")#(?:(?:[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?\.)+[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-zA-Z]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
I think my regex is too long; could that be a problem?
session = HTMLSession()
mailing = []
for index, i in enumerate(link):  # link is the list of the URLs
    try:
        r = session.get(i)
        site = r.html.absolute_links
        linkslist = list(r.html.absolute_links)
    except:
        linkslist = list(i)
    for j in linkslist:
        try:
            r1 = session.get(j)
            for re_match in re.finditer(EMAIL_REGEX, r1.html.raw_html.decode()):
                mail = (re_match.group())
                liste = [index, mail, j]
                mailing.append(liste)
        except:
            pass
print(mailing)
df = pd.DataFrame(mailing, columns=['index1', 'mail', 'lien'])
Thanks for your help.
I think multi-threading should do the job. As for your regex, I don't know what it does, but assuming it works and is helpful, the multi-threaded version should look like the following. I tested the code, and it works.
from threading import Thread, Lock
from requests_html import HTMLSession
import re

lock = Lock()
link = ["http://etsgaidonsarl.site-solocal.com/",
        "http://fr-fr.facebook.com/people/",
        "http://ipm-mondia.com/",
        "http://lfgenieclimatique.fr/",
        "http://vpcinstallation.site-solocal.com",
        "http://www.cavifroid.fr/",
        "http://www.clim-monnier.com/",
        "http://www.climacool.net/"]
linklist = []
mailing = []
main_threads = []
minor_threads = []
EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")#(?:(?:[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?\.)+[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-zA-Z]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

def links_scraper(single_url):
    try:
        session = HTMLSession()
        r = session.get(single_url)
        site = r.html.absolute_links
        the_list = list(r.html.absolute_links)
        linklist.extend(list(zip([single_url for _ in range(len(the_list))], the_list)))
    except Exception as e:
        # print("Exception:", e)
        linklist.append((single_url, single_url))

def mail_scrapper(main_url, single_link):
    try:
        session = HTMLSession()
        r1 = session.get(single_link)
        for re_match in re.finditer(EMAIL_REGEX, r1.html.raw_html.decode()):
            mail = (re_match.group())
            liste = [link.index(main_url), mail, single_link]
            mailing.append(liste)
    except Exception as e:
        # print(f"Exception: {e}")
        pass

def main():
    for l in link:
        t = Thread(target=links_scraper, args=(l,))
        t.start()
        main_threads.append(t)
    while len(main_threads) > 0:
        try:
            with lock:
                current_link = linklist.pop(0)
            minor_thread = Thread(target=mail_scrapper, args=(current_link[0], current_link[1]))
            minor_threads.append(minor_thread)
            minor_thread.start()
        except IndexError:
            pass
        for t in main_threads:
            if t.is_alive() == False:  # is_alive() replaces the removed isAlive() alias
                main_threads.pop(main_threads.index(t))
    for t in minor_threads:
        t.join()

main()
print("Mailing:", mailing)
I am using requests to resolve URLs for about 410K check-in data points. However, the process hangs somewhere for hours and I am not sure where the problem is. I did the same thing for 1.7M pieces of data before and it worked well. Here is my code:
import re
import time
import pickle
import multiprocessing
import requests

pat = re.compile("(?P<url>https?://[^\s]+)") # always compile it

def resolve_url(text):
    url = 'before'
    long_url = 'after'
    error = 'none'
    match = pat.search(text)
    if match:
        url = match.group("url")
        try:
            long_url = requests.head(url, allow_redirects=True).url
        except requests.exceptions.RequestException as e:
            error = e
    return (url, long_url, error)

pool = multiprocessing.Pool(200)
resolved_urls = []
for i, res in enumerate(pool.imap(resolve_url, text_with_url)):
    resolved_urls.append(res)
    if i % 10000 == 0 and i > 0:
        print("%d elements have been processed, %2.5f seconds" % (i + 1, time.time() - t0))
        fout = open("./yangj/resolved_urls_%d_requests.pkl" % (i + 1), "w")
        pickle.dump(resolved_urls, fout)
        fout.close()
        resolved_urls = []
fout = open("./yangj/resolved_urls_last_requests.pkl", "w")
pickle.dump(resolved_urls, fout)
fout.close()
I was wondering whether the problem is due to some exception that I need to write code to recover from. I have looked through the requests documentation and previous similar questions, but I didn't find a matching answer. Any ideas on how to solve the problem?
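One thing worth checking: requests does not apply any timeout by default, so a single host that never responds can block a worker indefinitely; whether that is what is happening here is only a guess. A minimal sketch of resolve_url with an explicit timeout (the 10-second value is arbitrary):
# Sketch: same resolver, but a dead host raises a timeout (caught below as a
# RequestException) instead of hanging the worker forever.
def resolve_url(text, timeout=10):
    url, long_url, error = 'before', 'after', 'none'
    match = pat.search(text)
    if match:
        url = match.group("url")
        try:
            long_url = requests.head(url, allow_redirects=True, timeout=timeout).url
        except requests.exceptions.RequestException as e:
            error = e
    return (url, long_url, error)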
I'm finding the boto DynamoDB documentation almost completely lacking in examples.
In Python, I simply want to output the contents of a table, limited to a certain number of records, say the 500 latest ones from a certain date.
Here is what I have...
import boto.dynamodb
import sys

#----------PUBLIC VARIABLES--------------------------#
connection = boto.dynamodb.connect_to_region(
    'us-east-1',
    aws_access_key_id='somekey',
    aws_secret_access_key='somesecretkey')
#----------------------------------------------------#

def info():
    print('#########################_TABLE_NAMES_#########################')
    #get and print list of tables
    tablenames = connection.list_tables()
    for table in tablenames:
        print('DynamoDB table: %s' % table)
        #print(connection.describe_table(table))
    print('###############################################################' + '\n')

def main():
    print('###########################_RESULTS_###########################')
    scan = myTable.scan(scan_filter=None, attributes_to_get=['SomeField'])
    results = []
    for x in scan:
        results.append(x['SomeField'])
    print('###############################################################' + '\n')

def writeError(error):
    try:
        f = open("error.txt", "w")
        try:
            f.write(error) # Write a string to a file
        finally:
            f.close()
    except IOError:
        print "WriteError - Error!"

if __name__ == '__main__':
    try:
        info()
        main()
    except:
        writeError("Unexpected error:" + str(sys.exc_info()))
        print "Error"
The table I have hasn't got any custom indexes so I'd be looking for something pretty basic as an example.
I'm sorry I don't have a better attempt, but I've researched and not found a lot to go on.
I've modified your script to print out the first 500 scan results for each table. Don't forget to correct the field name (I put someField):
import boto.dynamodb2
from boto.dynamodb2.table import Table
import sys

#----------PUBLIC VARIABLES--------------------------#
connection = boto.dynamodb2.connect_to_region(
    'us-east-1')
#----------------------------------------------------#

def getTableNames():
    '''get list of tables'''
    tablenames = connection.list_tables()["TableNames"]
    return tablenames

def main(tablenames=[]):
    print('###########################_RESULTS_###########################')
    for table in tablenames:
        print "Table Name: " + table
        myTable = Table(table)
        scan = myTable.scan()
        results = []
        for item in scan:
            if len(results) >= 500:
                break
            results.append(item.get('someField'))
        for result in results:
            print result
        print('###############################################################' + '\n')

def writeError(error):
    try:
        f = open("error.txt", "w")
        try:
            f.write(error) # Write a string to a file
        finally:
            f.close()
    except IOError:
        print "WriteError - Error!"

if __name__ == '__main__':
    try:
        tablenames = getTableNames()
        main(tablenames)
    except:
        writeError("Unexpected error:" + str(sys.exc_info()))
        print "Error"
Please note that DynamoDB doesn't return scan results in any particular order. If you want them ordered by the latest changes, you can use a solution based on DynamoDB Streams (https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html) or add a secondary index: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GSI.html
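To get closer to the "from a certain date" part of the question, boto.dynamodb2's Table.scan() also accepts filter keyword arguments, as shown in the boto2 tutorial. This is only a sketch under a couple of assumptions: the table has a date attribute, here called someDateField (a placeholder, like someField above), and it is stored in a comparable format; note that a scan filter is applied after items are read, so it does not make the scan itself any cheaper:
# Sketch: filtered scan, with placeholder table/attribute names.
myTable = Table('someTableName')
scan = myTable.scan(someDateField__gte='2015-01-01')
results = []
for item in scan:
    if len(results) >= 500:
        break
    results.append(item.get('someField'))
for result in results:
    print result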