I have a list of 1,000 dictionaries; each dictionary contains a URL and a file name.
for di in images_to_download:
    temp = download_image(di['img_url'], di['image_full_name'])
    if not temp:
        continue
    upload_file(temp, t4['id'])  # t4['id'] is the Drive folder id (defined elsewhere in my script)
def download_image(img_url, image_full_name):
    # 'headers' and 'query' are defined elsewhere in my script
    try:
        req = request(img_url, headers=headers)
        raw_img = urlopen(req).read()
        try:
            with open(image_full_name, 'wb') as file:
                file.write(raw_img)
        except OSError:
            # Fall back to a random file name if the original one can't be used
            image_full_name = query + str(random.randint(0, 100000))
            with open(image_full_name, 'wb') as file:
                file.write(raw_img)
        return image_full_name
    except Exception as e:
        print("Download failed: {}".format(e))
        return None
def upload_file(file_name, folder_id):
    file2 = drive.CreateFile({'parents': [{'id': folder_id}]})
    file2.SetContentFile(file_name)
    file2.Upload()
This is my code so far. It works well, but it is slow (3-6 seconds per image), so I would like to thread it. How can I do that?
The images come from different websites, so I can download them in parallel without sending requests to any single site too quickly.
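Something like the following is what I have in mind: a rough sketch using concurrent.futures from the standard library, assuming download_image/upload_file behave as above. I am not sure whether the PyDrive client is thread-safe; if it is not, the uploads may need to stay sequential.
from concurrent.futures import ThreadPoolExecutor, as_completed

folder_id = t4['id']  # the Drive folder id already used in the loop above

def download_and_upload(di):
    # One download + upload per worker thread
    temp = download_image(di['img_url'], di['image_full_name'])
    if temp:
        upload_file(temp, folder_id)
    return temp

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(download_and_upload, di) for di in images_to_download]
    for future in as_completed(futures):
        future.result()  # re-raises any exception from a worker thread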
I need help finding email addresses on websites. After some research I found a solution, but it is very slow; I have a lot of data (more than 90,000 URLs) and my code never finishes.
Do you know any tips to optimize/accelerate my code?
This is my list of URLs:
http://etsgaidonsarl.site-solocal.com/
http://fr-fr.facebook.com/people/
http://ipm-mondia.com/
http://lfgenieclimatique.fr/
http://vpcinstallation.site-solocal.com
http://www.cavifroid.fr/
http://www.clim-monnier.com/
http://www.climacool.net/
I use two loops. The first finds all the pages of a website, because the email address is not always on the first page.
In the second loop, I scan each page to find the email address. The code:
EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")#(?:(?:[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?\.)+[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-zA-Z]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
I think my regex is very long; could that be a problem?
session = HTMLSession()
mailing = []
for index, i in enumerate(link):  # link is the list of the URLs
    try:
        r = session.get(i)
        site = r.html.absolute_links
        linkslist = list(r.html.absolute_links)
    except:
        linkslist = [i]  # fall back to just the page itself
    for j in linkslist:
        try:
            r1 = session.get(j)
            for re_match in re.finditer(EMAIL_REGEX, r1.html.raw_html.decode()):
                mail = re_match.group()
                liste = [index, mail, j]
                mailing.append(liste)
        except:
            pass
print(mailing)
df = pd.DataFrame(mailing, columns=['index1', 'mail', 'lien'])
Thanks for your help.
I think multi-threading should do the job. I don't know exactly what your regex does, but assuming it works and is useful, the multi-threaded version should look like the following. I tested the code and it works.
from threading import Thread, Lock
from requests_html import HTMLSession
import re

lock = Lock()

link = ["http://etsgaidonsarl.site-solocal.com/",
        "http://fr-fr.facebook.com/people/",
        "http://ipm-mondia.com/",
        "http://lfgenieclimatique.fr/",
        "http://vpcinstallation.site-solocal.com",
        "http://www.cavifroid.fr/",
        "http://www.clim-monnier.com/",
        "http://www.climacool.net/"]

linklist = []
mailing = []
main_threads = []
minor_threads = []

EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")#(?:(?:[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?\.)+[a-zA-Z](?:[a-z0-9-]*[a-zA-Z])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-zA-Z]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

def links_scraper(single_url):
    try:
        session = HTMLSession()
        r = session.get(single_url)
        site = r.html.absolute_links
        the_list = list(r.html.absolute_links)
        linklist.extend(list(zip([single_url for _ in range(len(the_list))], the_list)))
    except Exception as e:
        # print("Exception:", e)
        linklist.append((single_url, single_url))

def mail_scrapper(main_url, single_link):
    try:
        session = HTMLSession()
        r1 = session.get(single_link)
        for re_match in re.finditer(EMAIL_REGEX, r1.html.raw_html.decode()):
            mail = re_match.group()
            liste = [link.index(main_url), mail, single_link]
            mailing.append(liste)
    except Exception as e:
        # print(f"Exception: {e}")
        pass

def main():
    for l in link:
        t = Thread(target=links_scraper, args=(l,))
        t.start()
        main_threads.append(t)

    while len(main_threads) > 0:
        try:
            with lock:
                current_link = linklist.pop(0)
            minor_thread = Thread(target=mail_scrapper, args=(current_link[0], current_link[1]))
            minor_threads.append(minor_thread)
            minor_thread.start()
        except IndexError:
            pass

        for t in main_threads:
            if not t.is_alive():  # was t.isAlive(), which has been removed in Python 3.9
                main_threads.pop(main_threads.index(t))

    for t in minor_threads:
        t.join()

main()
print("Mailing:", mailing)
I wrote the following code, where I pass multiple URLs to an API and want the output written to different pandas dataframes. It (sort of) works, but the outcome is incorrect:
1) It seems to enter the function and print "Success" way too many times. Why?
2) The output for all the dataframes is the same; I am not sure where the error is.
See the function:
def data_extract(url):
    payload = {'limit': '200000'}
    # Persists parameters across requests
    s = requests.Session()
    # To determine success of request, and error code
    for url in url:
        try:
            response = s.get(url)
            # If the response was successful, no Exception will be raised
            response.raise_for_status()
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
        except Exception as err:
            print(f'Other error occurred: {err}')
        else:
            # Ret
            jsonData = s.get(url, params=payload).json()
            print('Success!')
            df_tmr = pd.DataFrame(jsonData['records'])
    return df_tmr
See the call to the function:
urls = {
# Rainfall data
'tot_rain_mth': 'https://data.gov.sg/dataset/5942f8bd-4240-4f68-acd2-a5a276958237/resource/778814b8-1b96-404b-9ac9-68d6c00e637b/data',
'no_days_rain_mth': 'https://data.gov.sg/dataset/rainfall-monthly-number-of-rain-days/resource/8b94f596-91fd-4545-bf9e-7a426493b674/data',
'max_rain_mth': 'https://data.gov.sg/dataset/rainfall-monthly-maximum-daily-total/resource/df4d391e-6950-4fc6-80cd-c9b9ef6354fe/data',
# Temperature Data
'mean_sun_dur_mth': 'https://data.gov.sg/dataset/sunshine-duration-monthly-mean-daily-duration/resource/0230819f-1c83-4980-b738-56136d6dc300/data',
'wet_bulb_hr': 'https://data.gov.sg/dataset/wet-bulb-temperature-hourly/resource/0195dc7a-2f49-4107-ac7c-3112ca4a09a8/data',
'min_air_temp_day': 'https://data.gov.sg/dataset/surface-air-temperature-mean-daily-minimum/resource/ad0d8a97-9321-42e9-ac6f-46bf12845d44/data',
'min_air_temp_mth': 'https://data.gov.sg/dataset/surface-air-temperature-monthly-absolute-extreme-minimum/resource/0c5b9752-2488-46cc-ae1c-42318d0f8865/data',
'mean_air_temp_mth': 'https://data.gov.sg/dataset/surface-air-temperature-monthly-mean/resource/07654ce7-f97f-49c9-81c6-bd41beba4e96/data',
'max_air_temp_day': 'https://data.gov.sg/dataset/surface-air-temperature-mean-daily-maximum/resource/c7a7d2fd-9d32-4508-92ef-d1019e030a2f/data',
'max_air_temp_mth': 'https://data.gov.sg/dataset/air-temperature-absolute-extremes-maximum/resource/96e66346-68bb-4ca9-b001-58bbf39e36a7/data',
# Humidity Data
'min_hum_mth': 'https://data.gov.sg/dataset/relative-humidity-monthly-absolute-extreme-minimum/resource/585c24a5-76cd-4c48-9341-9223de5adc1d/data',
'mean_hum_mth': 'https://data.gov.sg/dataset/relative-humidity-monthly-mean/resource/4631174f-9858-463d-8a88-f3cb21588c67/data',
'mean_hum_yr': 'https://data.gov.sg/dataset/relative-humidity-annual-mean/resource/77b9059f-cc9a-4f4f-a495-9c268945191b/data'
}
df = {}
for i in range(len(urls.keys())):
    df[str(i)] = pd.DataFrame()
    #print('Name of Dataframe:', df)
    df[str(i)] = data_extract(urls.values())
print(df['0'])
print(df['1'])
--> Sorry about the bad formatting; I can't quite get it right in SO.
import requests
import pandas as pd

def data_extract(url):
    print(url)
    payload = {'limit': '200000'}
    s = requests.Session()
    try:
        response = s.get(url)
        response.raise_for_status()
        jsonData = s.get(url, params=payload).json()
        print('Success!')
    except Exception as err:
        print(f'Other error occurred: {err}')
    df_tmr = pd.DataFrame(jsonData['records'])
    return df_tmr
urls = {
# Rainfall data
'tot_rain_mth': 'https://data.gov.sg/dataset/5942f8bd-4240-4f68-acd2-a5a276958237/resource/778814b8-1b96-404b-9ac9-68d6c00e637b/data',
'no_days_rain_mth': 'https://data.gov.sg/dataset/rainfall-monthly-number-of-rain-days/resource/8b94f596-91fd-4545-bf9e-7a426493b674/data',
'max_rain_mth': 'https://data.gov.sg/dataset/rainfall-monthly-maximum-daily-total/resource/df4d391e-6950-4fc6-80cd-c9b9ef6354fe/data',
# Temperature Data
'mean_sun_dur_mth': 'https://data.gov.sg/dataset/sunshine-duration-monthly-mean-daily-duration/resource/0230819f-1c83-4980-b738-56136d6dc300/data',
'wet_bulb_hr': 'https://data.gov.sg/dataset/wet-bulb-temperature-hourly/resource/0195dc7a-2f49-4107-ac7c-3112ca4a09a8/data',
'min_air_temp_day': 'https://data.gov.sg/dataset/surface-air-temperature-mean-daily-minimum/resource/ad0d8a97-9321-42e9-ac6f-46bf12845d44/data',
'min_air_temp_mth': 'https://data.gov.sg/dataset/surface-air-temperature-monthly-absolute-extreme-minimum/resource/0c5b9752-2488-46cc-ae1c-42318d0f8865/data',
'mean_air_temp_mth': 'https://data.gov.sg/dataset/surface-air-temperature-monthly-mean/resource/07654ce7-f97f-49c9-81c6-bd41beba4e96/data',
'max_air_temp_day': 'https://data.gov.sg/dataset/surface-air-temperature-mean-daily-maximum/resource/c7a7d2fd-9d32-4508-92ef-d1019e030a2f/data',
'max_air_temp_mth': 'https://data.gov.sg/dataset/air-temperature-absolute-extremes-maximum/resource/96e66346-68bb-4ca9-b001-58bbf39e36a7/data',
# Humidity Data
'min_hum_mth': 'https://data.gov.sg/dataset/relative-humidity-monthly-absolute-extreme-minimum/resource/585c24a5-76cd-4c48-9341-9223de5adc1d/data',
'mean_hum_mth': 'https://data.gov.sg/dataset/relative-humidity-monthly-mean/resource/4631174f-9858-463d-8a88-f3cb21588c67/data',
'mean_hum_yr': 'https://data.gov.sg/dataset/relative-humidity-annual-mean/resource/77b9059f-cc9a-4f4f-a495-9c268945191b/data'
}
df = {}
temp = list(urls.values())
for i in range(len(temp)):
    df[str(i)] = data_extract(temp[i])
print(df['0'])
print(df['1'])
if len(df) == len(temp):
    print('success')
I think this will help you. You were iterating over all the items inside the function but returning only the last one. You just need to remove the for loop from the data_extract method and pass one URL per call.
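As a small follow-up sketch, you can also key each dataframe by the dataset name instead of a stringified index, using the one-URL-per-call data_extract above, so each dataframe maps directly to its source:
df = {name: data_extract(url) for name, url in urls.items()}
print(df['tot_rain_mth'].head())
print(df['mean_hum_yr'].head())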
I want to check whether a website exists, given a list of websites in the format XXXXX.com, where XXXXX is a 5-digit number. So I want to go through 00000 up to 99999 and see if those variants of the website exist.
I want to do something like
import requests

request = requests.get('http://www.example.com')
if request.status_code == 200:
    print('Web site exists')
else:
    print('Web site does not exist')
But I also want to generate a list of some sort (or even just export a list to CSV), so that for each URL I know whether it exists or not.
Any advice would be great!
I'm going to assume that you have a large list of URLs and want to read them in from a source file, say a text file, rather than hard-coding a large list of URLs in a Python file. If that's the case, run the script below and you'll get what you want.
import urllib.request
import urllib.error
import time
from multiprocessing import Pool

start = time.time()

file = open('C:\\your_path\\check_me.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
print(urls)

def checkurl(url):
    try:
        conn = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason) + ', ' + url)
    else:
        # 200
        print('good' + ', ' + url)

if __name__ == "__main__":
    p = Pool(processes=20)
    result = p.map(checkurl, urls)
    print("done in : ", time.time() - start)
Try combining xrange (range in Python 3) and the string zfill method in a loop.
import requests

def test_for_200(url):
    req = requests.get(url)
    return req.status_code == 200

def numbers():
    for n in xrange(100000):
        yield str(n).zfill(5)

results = {}
for num in numbers():
    url = "http://{}.com".format(num)
    results[num] = test_for_200(url)
results will look something like this:
>>> results
{'00000': True, '00001': False, ...}
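Since you also want to export the results, here is a small follow-up sketch that dumps the dict to a CSV (Python 3 file handling shown; the file name is just an example):
import csv

with open('site_checks.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['number', 'exists'])
    for num in sorted(results):
        writer.writerow([num, results[num]])
Note that checking 100,000 URLs sequentially will be slow; you could combine this with the multiprocessing Pool approach from the other answer.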
I've written a program that fetches the desired information from a blog or any other page. The next thing I want to achieve is to retrieve the first image from that page that belongs to the respective post (just like Facebook does when a post is shared).
I was able to achieve this to some extent by fetching the first image with an alt tag (since many websites don't have alt tags on their logos, icons, etc., the first image with one should belong to the post). But this does not seem to work in some cases. Is there any other (better) way to achieve this?
I'm using Python 2.7.9 and BeautifulSoup 4.
import time
import feedparser
import requests
from bs4 import BeautifulSoup

d = feedparser.parse('http://rss.cnn.com/rss/edition.rss')

for entry in d.entries:
    try:
        if entry.title is not None:
            print entry.title
            print ""
    except Exception, e:
        print e

    try:
        if entry.link is not None:
            print entry.link
            print ""
    except Exception, e:
        print e

    try:
        if entry.published[5:16] is not None:
            print entry.published[5:16]
            print ""
    except Exception, e:
        print e

    try:
        if entry.category is not None:
            print entry.category
            print ""
    except Exception, e:
        print e

    try:
        if entry.get('summary', '') is not None:
            print entry.get('summary', '')
            print ""
    except Exception, e:
        print e

    time.sleep(5)
    r = requests.get(entry.link, headers={'User-Agent': 'Safari/534.55.3 '})
    soup = BeautifulSoup(r.text, 'html.parser')
    for img in soup.findAll('img'):
        if img.has_attr('alt'):
            if img['src'].endswith('.jpg') or img['src'].endswith('.png'):
                print img['src']
                break
It is probably more practical to take a look at the opengraph module:
https://pypi.python.org/pypi/opengraph/0.5
and adapt it the way you like.
It will fetch the "first image" from the HTML code, or use og:image when it is present.
If you want to learn how it works, you can also look at the source code. The module uses BeautifulSoup too.
I needed the following monkeypatch to activate scraping as fallback:
import re
from bs4 import BeautifulSoup
from opengraph import OpenGraph

def parser(self, html):
    """
    """
    if not isinstance(html, BeautifulSoup):
        doc = BeautifulSoup(html, from_encoding='utf-8')
    else:
        doc = html
    ogs = doc.html.head.findAll(property=re.compile(r'^og'))
    for og in ogs:
        self[og[u'property'][3:]] = og[u'content']

    # Couldn't fetch all attrs from og tags, try scraping body
    if not self.is_valid() and self.scrape:
        for attr in self.required_attrs:
            if not hasattr(self, attr):
                try:
                    self[attr] = getattr(self, 'scrape_%s' % attr)(doc)
                except AttributeError:
                    pass

OpenGraph.parser = parser
OpenGraph.scrape = True  # workaround for some subtle bug in opengraph
You may need to handle relative URLs in the image sources, but that is quite straightforward with urljoin from urlparse:
import opengraph
...
page = opengraph.OpenGraph(url=link, scrape=True)
...
if page.is_valid():
    ...
    image_url = page.get('image', None)
    ...
    if not image_url.startswith('http'):
        image_url = urljoin(page['_url'], page['image'])
(some checks are omitted from the code fragment for brevity)
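If you would rather not monkeypatch opengraph, a rough equivalent with just requests and BeautifulSoup (which you already use) is to prefer the og:image meta tag and fall back to the first plausible img tag. The function name below is only illustrative:
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin  # urllib.parse on Python 3

def first_image(url):
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    og = soup.find('meta', property='og:image')  # prefer the og:image tag if present
    if og and og.get('content'):
        return urljoin(url, og['content'])
    for img in soup.findAll('img'):  # fall back to the first likely post image
        src = img.get('src', '')
        if img.has_attr('alt') and src.endswith(('.jpg', '.png')):
            return urljoin(url, src)
    return None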
I just created a Python program that calls the Google webmaster API to check whether a target site is mobile friendly or not, and then, based on the response, extracts certain JSON elements. It also saves the screenshot to a local folder.
The script is working fine, but when I try to write those JSON objects to a CSV file, it does not work.
Here is my code:
import requests, json, string, random, time
import csv
from base64 import decodestring
from random import randint
#links = open(r'D:\\Carlos\\Links.txt')
links = ['https://www.googleapis.com/pagespeedonline/v3beta1/mobileReady?key=AIzaSyDkEX-f1JNLQLC164SZaobALqFv4PHV-kA&screenshot=true&snapshots=true&locale=en_US&url=https://www.economicalinsurance.com/en/&strategy=mobile&filter_third_party_resources=false',
'https://www.googleapis.com/pagespeedonline/v3beta1/mobileReady?key=AIzaSyDkEX-f1JNLQLC164SZaobALqFv4PHV-kA&screenshot=true&snapshots=true&locale=en_US&url=http://www.volkswagen-me.com/en-vwme/service/protection/motor-insurance.html&strategy=mobile&filter_third_party_resources=false']
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

i = 12

def get_data(each):
    try:
        r = requests.get(each)
    except:
        pass
    #time.sleep(randint(1, 3))
    try:
        json_data = json.loads(r.text)
    except:
        pass
    try:
        score = json_data['ruleGroups']['USABILITY']['score']; score = int(score)
    except:
        pass
    try:
        Pass = json_data['ruleGroups']['USABILITY']['pass']; Pass = str(Pass)
    except:
        pass
    try:
        ConfigureViewport = json_data['formattedResults']['ruleResults']['ConfigureViewport']['localizedRuleName']; ConfigureViewport = str(ConfigureViewport)
    except:
        pass
    try:
        UseLegibleFontSizes = json_data['formattedResults']['ruleResults']['UseLegibleFontSizes']['localizedRuleName']; UseLegibleFontSizes = str(UseLegibleFontSizes)
    except:
        pass
    try:
        AvoidPlugins = json_data['formattedResults']['ruleResults']['AvoidPlugins']['localizedRuleName']; AvoidPlugins = str(AvoidPlugins)
    except:
        pass
    try:
        SizeContentToViewport = json_data['formattedResults']['ruleResults']['SizeContentToViewport']['localizedRuleName']; SizeContentToViewport = str(SizeContentToViewport)
    except:
        pass
    try:
        SizeTapTargetsAppropriately = json_data['formattedResults']['ruleResults']['SizeTapTargetsAppropriately']['localizedRuleName']; SizeTapTargetsAppropriately = str(SizeTapTargetsAppropriately)
    except:
        pass
    try:
        AvoidInterstitials = json_data['formattedResults']['ruleResults']['AvoidInterstitials']['localizedRuleName']; AvoidInterstitials = str(AvoidInterstitials)
    except:
        pass
    try:
        image_link = json_data['screenshot']['data']; image_link = image_link.replace("_", "/").replace("-", "+")
    except:
        pass
    #try:
    id_generator_name = "".join([random.choice(string.letters) for i in xrange(15)]) + '.jpeg'
    #except:
    #    pass
    #try:
    fh = open(id_generator_name, "wb")
    #except:
    #    pass
    try:
        fh.write(str(image_link).decode('base64'))
        time.sleep(1)
    except:
        pass
    try:
        fh.close()
    except:
        pass
    try:
        error_code = json_data['error']['message']; error_code = str(error_code)
    except:
        pass
    try:
        print each, score, Pass, ConfigureViewport, UseLegibleFontSizes, AvoidPlugins, SizeContentToViewport, SizeTapTargetsAppropriately, AvoidInterstitials, error_code
    except:
        pass
    try:
        writer.writerow({'each': each, 'score': score, 'Pass': Pass, 'ConfigureViewport': ConfigureViewport,
                         'UseLegibleFontSizes': UseLegibleFontSizes, 'AvoidPlugins': AvoidPlugins,
                         'SizeContentToViewport': SizeContentToViewport, 'SizeTapTargetsAppropriately': SizeTapTargetsAppropriately,
                         'AvoidInterstitials': AvoidInterstitials, 'error_code': error_code, 'imagename': id_generator_name})
    except:
        pass

#path to the csv file
with open("D:\Carlos\Data_file\output.csv", "ab") as export:
    fieldnames = ['each', 'score', 'Pass', 'ConfigureViewport', 'UseLegibleFontSizes', 'AvoidPlugins', 'SizeContentToViewport',
                  'SizeTapTargetsAppropriately', 'AvoidInterstitials', 'error_code', 'imagename']
    writer = csv.DictWriter(export, fieldnames=fieldnames)
    writer.writeheader()
    for each in links:
        #try:
        get_data(each)
        #except:
        #    pass
Please advise on how to write to the CSV, or where things are going wrong in the code.
I like to use Pandas dataframes for this, but it may be overkill if you wouldn't use Pandas otherwise. Pandas dataframes are also great for analysis and comparison.
You would put the JSON into a dataframe, and then output the dataframe to a CSV file.
import pandas as pd
df = pd.read_json('path/to/json/file')
df.to_csv('filename.csv')
Note that it is only this simple when your JSON has a single level and might as well be a CSV. Otherwise, you would need to read the JSON into a dict, navigate to the appropriate level, and then read that into a dataframe.
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
http://pandas.pydata.org/
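As a concrete illustration of the nested case, here is a hedged sketch for the PageSpeed responses from the question: pull the fields you care about into flat dicts, then let pandas write the CSV. The key paths follow the json_data lookups used in the question, and the output file name is just an example:
import requests
import pandas as pd

rows = []
for each in links:  # 'links' is the URL list from the question
    json_data = requests.get(each).json()
    rows.append({
        'url': each,
        'score': json_data.get('ruleGroups', {}).get('USABILITY', {}).get('score'),
        'pass': json_data.get('ruleGroups', {}).get('USABILITY', {}).get('pass'),
        'error': json_data.get('error', {}).get('message'),
    })

pd.DataFrame(rows).to_csv('output.csv', index=False)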