Extracting Gzip files from requests with multithreading - python

I'm trying to extract a bunch (62.000) gzip files. In these files is a text document formatted as JSON. Right now I'm downloading all these files with the requests module with multithreading:
def fetch_file(url, filename):
    """Download *url* and store the raw response body as a ``.json.gz`` file.

    Args:
        url: Address to fetch; redirects are followed.
        filename: Base name (without extension) of the file written into
            the ``Streams Total Daily/`` directory.

    Returns:
        The HTTP status code of the response, or the
        ``requests.exceptions.RequestException`` instance if the request
        failed.
    """
    try:
        html = requests.get(url, stream=True, allow_redirects=True)
        # The context manager closes the handle deterministically; with tens
        # of thousands of downloads, leaving that to garbage collection can
        # exhaust file descriptors.
        with open('Streams Total Daily/' + filename + '.json.gz', 'wb') as out:
            out.write(html.content)
        return html.status_code
    except requests.exceptions.RequestException as e:
        return e
def get_streams():
    """Submit one ``fetch_file`` job per entry of ``country_uris`` and print each result.

    The target file name is assembled from path segments 1, 4, 5, 6 and 7
    of the URI, and the download URL from ``link``, the URI and
    ``access_token``.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        pending = []
        for uri in country_uris:
            parts = uri.split('/')
            chosen = [parts[1], parts[4], parts[5], parts[6], parts[7]]
            filename = 'streams_' + '_'.join(chosen)
            url = f"{link}{uri}?access_token={access_token}"
            pending.append(executor.submit(fetch_file, url, filename))
        # Print whatever fetch_file returned, in completion order.
        for finished in as_completed(pending):
            print(finished.result())


get_streams()
I have some code that can loop through the folder where the files are placed, but this takes a long time with 62000 files. I've tried some versions passing response.content through gzip.GzipFile(), but this just gives me empty files.
def fetch_file(url, filename):
    """Fetch *url* and try to write its gunzipped body to ``test/<filename>.json``.

    Returns the HTTP status code, or the ``RequestException`` instance if
    the request itself failed.
    """
    try:
        response = requests.get(url, stream=True, allow_redirects=True)
        # NOTE(review): gzip.GzipFile documents ``fileobj`` as a file-like
        # object, while ``response.content`` is bytes. Confirm whether this
        # is why the output file is created but stays empty.
        archive = gzip.GzipFile(fileobj=response.content)
        target = 'test/' + filename + '.json'
        with open(target, 'wb') as out:
            out.write(archive.read())
        return response.status_code
    except requests.exceptions.RequestException as e:
        return e
def get_streams():
    """Submit one ``fetch_file`` job per entry of ``country_uris`` and print each result.

    The target file name is assembled from path segments 1, 4, 5, 6 and 7
    of the URI, and the download URL from ``link``, the URI and
    ``access_token``.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        pending = []
        for uri in country_uris:
            parts = uri.split('/')
            chosen = [parts[1], parts[4], parts[5], parts[6], parts[7]]
            filename = 'streams_' + '_'.join(chosen)
            url = f"{link}{uri}?access_token={access_token}"
            pending.append(executor.submit(fetch_file, url, filename))
        # Print whatever fetch_file returned, in completion order.
        for finished in as_completed(pending):
            print(finished.result())


get_streams()
Does anyone have an idea on how to handle this? Any suggestions or solutions are much appreciated!

What worked for me, in the end, was the Zlib module. First getting a bytes response with response.content, then decompressing the data with zlib.decompress(response.content, 16 +zlib.MAX_WBITS) and finally writing the decompressed data to a .json file:
def get_files(i):
    """Download one gzip-compressed file and store it decompressed as JSON.

    Args:
        i: URI path of the file; it is appended to the module-level ``url``
            and its ``/``-separated segments 1, 3, 4, 5, 6 and 7 form the
            output file name.

    Returns:
        ``None`` on success, or the ``requests.exceptions.RequestException``
        instance if the request failed.
    """
    # Use a distinct local name. Assigning to ``url`` inside the function
    # would make ``url`` local, and reading it in the f-string would then
    # raise UnboundLocalError instead of using the module-level base URL.
    file_url = f"{url}{i}"
    elements = i.split('/')
    name = '_'.join(elements[k] for k in (1, 3, 4, 5, 6, 7))
    try:
        payload = requests.get(url=file_url, headers=headers, allow_redirects=True).content
        # wbits = 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        decompressed_data = zlib.decompress(payload, 16 + zlib.MAX_WBITS)
        with open(f"Streams Total Daily/{name}.json", 'wb') as out:
            out.write(decompressed_data)
    except requests.exceptions.RequestException as e:
        return e
def runner():
    """Run ``get_files`` for every entry of ``country_files`` on a 10-worker thread pool."""
    with ThreadPoolExecutor(max_workers=10) as executor:
        # NOTE(review): the futures are collected but their results are never
        # read, so anything get_files returns or raises goes unnoticed.
        threads = [executor.submit(get_files, i) for i in country_files]


runner()

Related

Python Multiprocessing in web crawler

I am trying to implement multiprocessing in my web crawler. What I usually see online is the URL being passed as the argument to the function given to map, map_async, or apply_async. The data I am crawling is in a table, so I extract it by calling BeautifulSoup's find_all twice, once for the rows and once for the columns. Since the data I am crawling is sometimes on a single page, which requires only one URL, I tried to use the list returned by find_all as the arguments for map_async, but an error occurs: "Fatal Python error: Cannot recover from stack overflow."
The error occurred on the following line
return_list = pool.map_async(func, Species_all_recorded_data_List)
How could I solve it or where should the multiprocessing be implemented will be better?
The second problem is that if I put some code above the function crawl_all_data_mp, then when pool = Pool() executes, all the code above it is executed again. I worked around this by simply moving all the other code below that function. That might not be correct, since I still can't actually run the code because of the first error.
Looking for your advice
My code:
(1) Function to call for web crawling
from tkinter import filedialog
from tkinter import *
import csv
import os.path
from os import path
from Index import *
from Dragonfly import *
import codecs
from multiprocessing import Process, Value
#\ multiprocessing ver
# Handle one table row: read its <td> cells, post for the detail page of the
# row's ID and return a DetailedTableInfo built from the cells plus the
# R_LAT, R_LNG and R_MEMO values of the detail page.
# The first seven arguments are bound with functools.partial by the caller;
# the row element is the last one. oldID is accepted but not referenced here.
# NOTE(review): DataCNT is compared to ints and passed to str() as a plain
# value, but is also used through .value and a lock - confirm which type it is.
def multiprocessing_row_data(Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID, page, Species_all_record_data_Data_Set):
global DataCNT, stop_crawl_all_data_mp
tmp_List = Species_all_record_data_Data_Set.find_all('td')
# End conditions:
# 1. there is no data on the next page
# 2. for an update, stop on reaching the old data (identified by its ID)
# 3. the count exceeds the limit
id = tmp_List[0].text
if (len(id) == 0) or (DataCNT >= expecting_CNT)or (DataCNT >= Limit_CNT):
print(' --Finish crawl--' + ' crawl to page: ' + str(page) + ", ID: " + id + ", count: " + str(DataCNT))
stop_crawl_all_data_mp = True
raise StopIteration
# Access the same value in memory when doing multiprocessing.
# NOTE(review): multiprocessing's synchronized wrappers document get_lock();
# confirm that DataCNT really provides getlock().
with DataCNT.getlock():
DataCNT.value += 1
response_DetailedInfo = session.post(general_url + Detailed_discriptions_url + id, headers=headers)
soup2 = BeautifulSoup(response_DetailedInfo.text, 'html.parser')
# Progress line; end='\r' rewrites the same console line on every call.
print("Current finished datas >> " + str(DataCNT.value) + " /" + str(Total_num) + " (" + str(DataCNT.value * 100 / Total_num) + "%)", end='\r')
# Cells 7 and 6 are passed in swapped order relative to their column positions.
return DetailedTableInfo(tmp_List[0].text, tmp_List[1].text, tmp_List[2].text, tmp_List[3].text, tmp_List[4].text, tmp_List[5].text, tmp_List[7].text, tmp_List[6].text,
soup2.find(id='R_LAT').get('value'),
soup2.find(id='R_LNG').get('value'),
Web_rawl_Species_family_name,
Web_rawl_Species_name,
soup2.find(id='R_MEMO').get('value'))
# Crawl the record pages of one species: post for a page, collect its
# id='theRow' elements, map multiprocessing_row_data over them with a
# 10-process pool and append the truthy results to DataList as one list per
# page. Returns [DataList, page].
# NOTE(review): `multiprocessing` and `partial` are not among the names the
# visible import lines bring in - presumably the star imports provide them; verify.
def crawl_all_data_mp(Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID):
page = 0
DataList = []
# NOTE(review): the stop flag is assigned inside the pool worker function;
# confirm that the assignment is visible to this loop.
while not stop_crawl_all_data_mp:
pool = multiprocessing.Pool(10)
# The request URL is the concatenation of the base URL, the record-list path,
# the page number and the family and species keys.
Species_all_recorded_data = session.post( general_url +
species_all_record_data_first_url +
species_all_record_data_page_url + str(page) +
species_all_record_data_species_url +
Species_class_key[Web_rawl_Species_family_name] +
Species_key[Web_rawl_Species_name],
headers=headers)
soup = BeautifulSoup(Species_all_recorded_data.text, 'html.parser')
Species_all_recorded_data_List = soup.find_all(id='theRow')
# Bind everything except the row element, which map_async supplies.
func = partial(multiprocessing_row_data, Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID, page)
return_list = pool.map_async(func, Species_all_recorded_data_List)
# get() blocks until every row of the page has been processed.
DataList.append(list(filter(None, return_list.get())))
page += 1
# Make sure that when main is finished, the sub-functions still keep running.
pool.close()
pool.join()
return [DataList, page]
(2) main
it goes wrong on the following line for calling the function above
[datatmpList, page] = crawl_all_data_mp(Input_species_famliy, Input_species, Total_num, limit_cnt, expecting_CNT, oldID)
the main code:
# --main--
# --main--
# Script entry point: choose a species, read the existing CSV (if any), crawl
# the records and write them to the species' CSV file.
if __name__ == '__main__':
# Settings
Input_species_famliy = "細蟌科"
Input_species = "四斑細蟌"
limit_cnt = 6000
folder = 'Crawl_Data\\' + Species_class_key[Input_species_famliy]
File_name = folder + "\\" + Species_class_key[Input_species_famliy] + Species_key[Input_species] +'.csv'
oldID = 0
oldData_len = 0
print("--Start crawl-- " + Input_species_famliy + " " + Input_species)
print("[folder]: " + folder)
stop_crawl_all_data_mp = False
# Check whether the file exists
file_check = path.exists(current_path + "\\" + File_name)
# Get the old ID: the first field of the first row of the existing file
if file_check:
file_size = os.stat(current_path + "\\" + File_name).st_size
if not file_size == 0:
with open(File_name, newline='', errors = "ignore") as F:
R = csv.reader(F)
oldData = [line for line in R]
oldID = oldData[0][0]
oldData_len = len(oldData)-1
# Log in
Login_Web(myaccount, mypassword)
# Find the total number of records per species (expected to run once)
Species_total_num_Dict = Find_species_total_data()
# Get the data
Total_num = int(Species_total_num_Dict[Input_species])
#[datatmpList, page] = crawl_all_data(Input_species_famliy, Input_species, Total_num, limit_cnt, oldID)
expecting_CNT = Total_num - oldData_len # number of records that still need to be updated or crawled
[datatmpList, page] = crawl_all_data_mp(Input_species_famliy, Input_species, Total_num, limit_cnt, expecting_CNT, oldID)
# Flatten each crawled record into one CSV row.
# NOTE(review): crawl_all_data_mp appends one list per page to its result;
# confirm that the items iterated here are records and not per-page lists.
Data = []
for Data_tmp in datatmpList:
Data.append([Data_tmp.SpeciesFamily,
Data_tmp.Species,
Data_tmp.IdNumber,
Data_tmp.Dates,
Data_tmp.Times,
Data_tmp.User,
Data_tmp.City,
Data_tmp.Dictrict,
Data_tmp.Place,
Data_tmp.Altitude,
Data_tmp.Latitude,
Data_tmp.Longitude,
Data_tmp.Description
])
# Create the output directory automatically
newDir = current_path + "\\" + folder
if (not os.path.isdir(newDir)):
os.mkdir(newDir)
# 'a' stands for append, which adds the new data after the old data
with open(File_name, mode='a', newline='', errors = "ignore") as employee_file:
employee_writer = csv.writer(employee_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
# Initial write: the file does not exist or is empty
if ((not file_check) or (file_size == 0)):
employee_writer.writerow(CSV_Head)
employee_writer.writerows(Data)
# Otherwise insert the new data in front of the old rows
# NOTE(review): the file is open in append mode, so writing oldData here adds
# these rows after the existing content - confirm that is intended.
else:
for i in range(0, len(Data)):
oldData.insert(i, Data[i])
employee_writer.writerows(oldData)

Python check if website exists for a list of websites

I want to check if a website exists, given a list of websites in the format XXXXX.com, where XXXXX=a 5 digit number. So I want to go through from 00000 up to 99999 and see if those variants of the website exist.
I want to do something like
import requests
request = requests.get('http://www.example.com')
# Status 200 is reported as "exists"; every other status code as "does not exist".
site_is_up = request.status_code == 200
print('Web site exists' if site_is_up else 'Web site does not exist')
But generate a list of some sort (or even just export a list to csv), so for each URL, i know if it exists or not.
Any advice would be great!
I'm going to make an assumption that you have a large list of URLs and you want to read them in from some source file, let's say a text file, rather than hard-coding a large list of URLs in Python file, right. If that's the case, run the script below and you'll get what you want.
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
start = time.time()
# Read the candidate URLs, one per line. The context manager closes the file,
# and stripping drops the trailing newline of every line as well as blank
# lines, which are not URLs.
with open('C:\\your_path\\check_me.txt', 'r', encoding="ISO-8859-1") as file:
    urls = [line.strip() for line in file if line.strip()]
print(urls)
def checkurl(url):
    """Try to open *url* and print a one-line verdict for it.

    Prints ``good, <url>`` when the URL opens, otherwise the error kind
    (``HTTPError``, ``URLError`` or ``ValueError``) with its detail and the
    URL. Returns ``None`` in every case.

    Args:
        url: Address to check; surrounding whitespace such as the trailing
            newline left by ``readlines()`` is ignored.
    """
    url = url.strip()
    try:
        conn = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code) + ', ' + url)
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason) + ', ' + url)
    except ValueError as e:
        # Malformed entry (e.g. missing scheme or blank line); report it
        # instead of letting one bad line abort the whole Pool.map run.
        print('ValueError: {}'.format(e) + ', ' + url)
    else:
        # 200
        conn.close()  # release the connection; only reachability matters
        print('good' + ', ' + url)
if __name__ == "__main__":
    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving the pool open until interpreter exit.
    with Pool(processes=20) as p:
        result = p.map(checkurl, urls)
    print("done in : ", time.time()-start)
Try combining xrange and the string zfill method in a loop.
import requests
def test_for_200(url):
    """Return True when a GET request to *url* is answered with status 200.

    A request that fails outright (for example because the host name does
    not resolve) counts as "does not exist" and returns False, so one dead
    address cannot abort a scan over many candidates.
    """
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException:
        return False
    return req.status_code == 200
def numbers():
    """Yield every number from 0 to 99999 as a zero-padded 5-character string."""
    # range is the Python 3 spelling of Python 2's lazy xrange, which no
    # longer exists as a builtin.
    for n in range(100000):
        yield str(n).zfill(5)
# Map every zero-padded number to whether http://<number>.com answered 200.
url_template = "http://{}.com"
results = {}
for num in numbers():
    url = url_template.format(num)
    results[num] = test_for_200(url)
results will look something like this:
>>> results
{'00000': True, '00001': False, ...}

Multiprocessing doesn't work in python web-scraping

I have finished web scraping with BeautifulSoup and successfully saved the parsed data into CSV files, but I want to speed up the process, so I used multiprocessing. However, there is no difference after I apply multiprocessing in the script. Here is my code:
rootPath = '....'
urlp1 = "https://www.proteinatlas.org/"
# Name the list file once so the error message can report it; the handler
# previously referenced ``f`` before it was ever assigned.
cancer_list_file = "cancer_list1_2(1).csv"
try:
    df1 = pd.read_csv(rootPath + cancer_list_file, header=0)
except Exception as e:
    print("File " + cancer_list_file + " doesn't exist")
    print(str(e))
    sys.exit()
# DataFrame.as_matrix() was removed from pandas; .values yields the same array.
cancer_list = df1.values.tolist()
# [["bcla_gene","beast+cancer"], ...]
URLs = []
for cancer in cancer_list:
    urlp2 = "/pathology/tissue/" + cancer[1]
    f = cancer[0]
    try:
        df1 = pd.read_csv(rootPath + f + ".csv", header=0)
    except Exception as e:
        print("File " + f + " doesn't exist")
        print(str(e))
        sys.exit()
    ...  # list of urls
def scrape(url, output_path):
    """Fetch *url* and parse the sample table and counts from the page.

    Args:
        url: Page to download.
        output_path: Not referenced in the visible part of the function.
    """
    # Open the ``url`` argument; the name ``URL`` is not defined anywhere in
    # the visible code and would raise NameError.
    page = urlopen(url)
    soup = BeautifulSoup(page, 'html.parser')
    item_text = soup.select('#scatter6001 script')[0].text
    table = soup.find_all('table', {'class': 'noborder dark'})
    # read_html returns a list of frames; only the first one is kept.
    df1 = pd.read_html(str(table), header=0)
    df1 = pd.DataFrame(df1[0])
    Number = soup.find('th', text="Number of samples").find_next_sibling("td").text
    ...
#function of scraping
if __name__ == "__main__":
    # NOTE(review): scrape(...) is called as the generator is consumed, so
    # Parallel only receives its return values. joblib's documented form
    # wraps the function with delayed() - confirm whether that was intended.
    jobs = (scrape(url, output_path) for url in URLs)
    Parallel(n_jobs=-1)(jobs)
I just updated the code, and the problem now is that CPU utilization reaches 100% only at the beginning but soon drops to 1%. I'm quite confused about that.
Without going to any details in your code: You may benefit from having a look at the joblib module.
Pseudocode:
import joblib

if __name__ == "__main__":
    URLs = ["URL1", "URL2", "URL2", ...]
    # joblib.delayed(scrape)(...) records the call without running it, so the
    # worker processes execute it; a plain scrape(...) would run serially in
    # the parent while the generator is consumed. The names are qualified
    # because only the joblib module itself is imported.
    joblib.Parallel(n_jobs=-1)(
        joblib.delayed(scrape)(url, output_path) for url in URLs
    )
Refactoring your code may be necessary because joblib only works if no code runs outside any def: and if __name__ == "__main__":-branch.
n_jobs=-1 will start a number of processes equivalent to the number of cores on your machine. For further details, refer to joblib's documentation.
Using this approach together with selenium/geckodriver, it is possible to scrape a pool of 10k URLs in less than an hour, depending on your machine (I usually open 40-50 processes on an octa-core machine with 64 GB of RAM).

Python: requests hang for hours

I am using requests to resolve URLs for about 410K pieces of check-in data. However, the process hangs somewhere for hours, and I am not sure where the problem is. I did the same thing for 1.7M pieces of data before and it worked well. Here is my code:
# Matches the first http(s) URL in a text, up to the next whitespace character.
# Compiled once at module level; the raw string keeps ``\s`` a regex escape
# rather than an invalid string escape.
pat = re.compile(r"(?P<url>https?://[^\s]+)")
def resolve_url(text):
    """Resolve the first URL found in *text* by following its redirects.

    Returns a ``(url, long_url, error)`` tuple. The placeholders
    ``'before'``, ``'after'`` and ``'none'`` stand in for a missing URL,
    an unresolved URL and "no error" respectively.
    """
    found = pat.search(text)
    if not found:
        return ('before', 'after', 'none')

    short_url = found.group("url")
    try:
        # NOTE(review): no timeout is passed, so a server that never answers
        # could block this call - confirm whether that explains the hang.
        response = requests.head(short_url, allow_redirects=True)
    except requests.exceptions.RequestException as exc:
        return (short_url, 'after', exc)
    return (short_url, response.url, 'none')
pool = multiprocessing.Pool(200)
resolved_urls = []
for i, res in enumerate(pool.imap(resolve_url, text_with_url)):
    resolved_urls.append(res)
    # Flush the accumulated batch to its own file every 10000 items.
    if i % 10000 == 0 and i > 0:
        print("%d elements have been processed, %2.5f seconds" %(i+1, time.time()-t0))
        # pickle writes bytes, so the file must be opened in binary mode; the
        # context manager closes it even if dumping fails.
        with open("./yangj/resolved_urls_%d_requests.pkl" % (i + 1), "wb") as fout:
            pickle.dump(resolved_urls, fout)
        resolved_urls = []
# Whatever is left after the last full batch.
with open("./yangj/resolved_urls_last_requests.pkl", "wb") as fout:
    pickle.dump(resolved_urls, fout)
# All results have been consumed; release the worker processes.
pool.close()
pool.join()
I was wondering whether the problem is because of some exception that I need to write code to recover. I have looked through requests documents and previous similar questions but I didn't find matching answers. Any idea to solve the problem?

request empty result issue

I have this simple Python code, which fetches the content of a URL and stores the result as JSON in a text file named "file", but it keeps returning an empty result.
What I am doing wrong here? It is just a simple code I am so disappointed.
I have included all the imports needed import Facebook,import request,and import json.
url = "https://graph.facebook.com/search?limit=5000&type=page&q=%26&access_token=xx&__after_id=139433456868"
content = requests.get(url).json()
# The context manager closes the file even if serialising or writing fails.
with open("file.txt", 'w') as file:
    file.write(json.dumps(content, indent=1))
but it keeps returning empty result to me what I am missing here?
here is the result:
"data": []
any help please?
It's working fine:
import urllib2

# NOTE(review): an access token is a credential; confirm this one is expired
# before keeping it in shared code.
accesstoken = "CAACEdEose0cBACF6HpTDEuVEwVnjx1sHOJFS3ZBQZBsoWqKKyFl93xwZCxKysqsQMUgOZBLjJoMurSxinn96pgbdfSYbyS9Hh3pULdED8Tv255RgnsYmnlxvR7JZCN7X25zP6fRnRK0ZCNmChfLPejaltwM2JGtPGYBQwnmAL9tQBKBmbZAkGYCEQHAbUf7k1YZD"
# Same search request as in the question, with the token spliced in.
search_url = "https://graph.facebook.com/search?limit=5000&type=page&q=%26&access_token=" + accesstoken + "&__after_id=139433456868"
urllib2.urlopen(search_url).read()
I think you have not requested access token before making the request.
How to find access token?
def getSecretToken(verification_code):
    """Exchange an OAuth verification code for an access token.

    Args:
        verification_code: The ``code`` value handed back by the OAuth dialog.

    Returns:
        The ``access_token`` field of the token endpoint's response.

    Raises:
        KeyError: If the response contains no ``access_token`` field.
    """
    # Passing the values through ``params`` lets requests URL-encode them;
    # a redirect URI contains characters that are not valid unescaped inside
    # a query string.
    response = requests.get(
        "https://graph.facebook.com/oauth/access_token",
        params={
            "client_id": app_id,
            "redirect_uri": my_url,
            "client_secret": app_secret,
            "code": verification_code,
        },
    ).text
    # assumes a form-encoded body such as access_token=...&expires=... - TODO confirm
    # Split on every "&" and only on the first "=", so responses with more
    # than two fields, or values containing "=", no longer break unpacking.
    result = response.split("&")
    print(result)
    params = {}
    for p in result:
        k, _, v = p.partition("=")
        params[k] = v
    return params['access_token']
how do you get that verification code?
verification_code=""
if "code" in request.query:
verification_code = request.query["code"]
if not verification_code:
dialog_url = ( "http://www.facebook.com/dialog/oauth?" +
"client_id=" + app_id +
"&redirect_uri=" + my_url +
"&scope=publish_stream" )
return "<script>top.location.href='" + dialog_url + "'</script>"
else:
access_token = getSecretToken(verification_code)

Categories

Resources