Multithreading Scrape Html and Safely Save to One File

Multithreading Scrape Html and Safely Save to One File - python

I want scrape the title from given url in multiple thread (example in 5 thread)
and save them to one text file. how to do it and how to make sure I safely save the output to one file?
this is my code:
import csv
import requests
requests.packages.urllib3.disable_warnings()
urls = []
with open('Input.csv') as csvDataFile:
csvReader = csv.reader(csvDataFile)
for row in csvReader:
urls.append(row[1])
def find_between( s, first, last ):
try:
start = s.index( first ) + len( first )
end = s.index( last, start )
return s[start:end]
except ValueError:
return ""
def get_title( url ):
try:
r = requests.get(url)
html_content = r.text.encode('UTF-8')
title = find_between(html_content , "<title>", "</title>")
return title
except:
return ""
for url in urls:
f = open('myfile.txt', 'a')
f.write(get_title(url) + '\n')
f.close()

try to use futures
1. create pool
2. sumbit function and parameters
3. get result from function
import csv
from concurrent import futures
pool = futures.ThreadPoolExecutor(5)
workers = [pool.sumbit(get_title,url) for url in urls]
while not all(worker.done() for worker in workers):
pass
with open(file) as f:
w = csv.writer(f)
w.writerows([[worker.result()] for worker in workers])

Related

Multi threading not processing full list

I am using multi-threading to visit links read from a csv, strangely irrespective of the max-workers or even when I remove the multi-threading part, the code runs for an arbitrarily lower number of urls than in the list. I print the list to verify the count. For e.g if the list has 5000 urls, the code stops at 4084, if the links are 13,000 it will stop at 9200, even when it is just 130 links it will stop at 80 or something. What am I doing wrong here?
import requests
import xlrd
import concurrent.futures
from bs4 import BeautifulSoup
import csv
header_added = False
file_location = "Urls.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_index(0)
all_links = []
for row in range(1, 11000):
all_links.append(sheet.cell_value(row,0))
print(len(all_links))
i = 0
def get_solution(url):
global header_added, i
page = requests.get(url).text
soup = BeautifulSoup(page, 'html.parser')
ques_div = soup.find('p', class_='header-description')
ques = ques_div.find('span').text
ans_divs = soup.findAll('div', class_='puzzle-solution')
ans = ans_divs[0].text
print("Solution ", i)
i += 1
dict1 ={"Words": ques, "Solution": ans}
with open('Results10k.csv', 'a+', encoding='utf-8') as f:
w = csv.DictWriter(f, dict1.keys())
if not header_added:
w.writeheader()
header_added = True
w.writerow(dict1)
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
result = executor.map(get_solution, all_links)

Here's a reworking of your code that doesn't need locks – instead, there's only ever one process that writes to the file.
Also, due to the GIL, using a ThreadPool will be slower than a process-backed Pool.
import csv
import multiprocessing
import requests
import xlrd
from bs4 import BeautifulSoup
sess = requests.Session()
def get_solution(url):
try:
resp = sess.get(url)
resp.raise_for_status()
page = resp.text
soup = BeautifulSoup(page, "html.parser")
ques_div = soup.find("p", class_="header-description")
ques = ques_div.find("span").text.strip()
ans_divs = soup.findAll("div", class_="puzzle-solution")
ans = ans_divs[0].text.strip()
return {"URL": url, "Words": ques, "Solution": ans, "Error": ""}
except Exception as exc:
print(url, "Error:", exc)
return {"URL": url, "Words": "", "Solution": "", "Error": str(exc)}
def read_links(file_location):
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_index(0)
all_links = []
for row in range(1, 11000):
all_links.append(sheet.cell_value(row, 0))
return all_links
def main():
links = read_links("./Urls.xlsx")
with open("Results10k.csv", "w", encoding="utf-8") as f:
with multiprocessing.Pool() as p: # (or multiprocessing.pool.ThreadPool)
for i, result in enumerate(p.imap_unordered(get_solution, links, chunksize=16)):
if i == 0:
writer = csv.DictWriter(f, result.keys())
writer.writeheader()
writer.writerow(result)
f.flush() # Ensure changes are written immediately
if i % 100 == 0: # Progress indicator
print(i)
if __name__ == "__main__":
main()

It could be, that get_solution() crashes for some of the URLs. You could add a try/except in the body of the function and write all crashed URLS to a different file.
def get_solution(url):
try:
...
except:
with open('errors.txt','a+') as f:
f.write(url+'\n')
If this is the problem the numbers should add up to the total number.
Also open() is probably not thread safe.
file_lock = threading.Lock()
def get_solution(url):
with file_lock:
with open('Results10k.csv', 'a+', encoding='utf-8') as f:
w = csv.DictWriter(f, dict1.keys())
...

python script failling to read csvfile with error - StopIteration

I am working on script which downloads large audit logs csv file from azure DevOps and filters data according given condition. This works for small csv file but for file with large data it fails with
fields = next(reader)
stopIteration
Can someone help with changes required in script? I am using python 3.7.9 on MacOs
def getproject(url,pat):
response = requests.get(url, auth=HTTPBasicAuth(username='',password=pat))
if response.status_code == 200:
url_data = response.content
tempfile = open("temp.csv","wb")
tempfile.write(url_data)
tempfile.close()
return url_data
else:
print("\nERROR : Unable to conect The server...")
def FilterData():
lists =[]
pro_name=[]
RepoId =[]
RepoName=[]
new_file = open("temp_new.csv", 'w',newline='')
writer = csv.writer(new_file)
with open("temp.csv", 'r') as readFile:
reader = csv.reader(readFile)
fields = next(reader)
lists.append(fields)
for row in reader:
for field in row:
if field == "Git.RepositoryCreated":
lists.append(row)
writer.writerows(lists)
readFile.close()
new_file.close()
os.remove("temp.csv")
timestamp = (datetime.datetime.now())
timestamp = timestamp.strftime("%d%B%Y_%H%M%S")
file_name = "Data2_"+str(timestamp)+".csv"
file1 = open("temp_new.csv",'r')
df = pd.read_csv(file1)
for i in df["Data"]:
res = json.loads(i)
pro_name.append(res['ProjectName'])
RepoId.append(res['RepoId'])
RepoName.append(res['RepoName'])
Disp_Name = df["ActorDisplayName"]
ActionId = df["ActionId"]
TimeStamp = df["Timestamp"]
file1.close()
os.remove("temp_new.csv")
Header = ["Actor Display Name","Project
Name","RepoName","RepoId","ActionId","Timestamp"]
d=[Disp_Name,pro_name,RepoName,RepoId,ActionId,TimeStamp]
export_data = zip_longest(*d, fillvalue = '')
with open(file_name, 'w',newline='') as myfile:
wr = csv.writer(myfile)
wr.writerow(Header)
wr.writerows(export_data)
myfile.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser("This is used for getting list of the projects")
parser.add_argument("-o" , dest="org", help="org name")
parser.add_argument("-p" , dest="pat", help="pat value")
parser.add_argument("-sd" , dest="sdate", help="Start Date")
parser.add_argument("-ed" , dest="edate", help="End Date")
args = parser.parse_args()
org = args.org
token = args.pat
startdate = args.sdate
enddate = args.edate
url = "https://auditservice.dev.azure.com/{org_name}/_apis/audit/downloadlog?
format=csv&startTime={startdt}&endTime={enddt}&api-version=6.1-
preview.1".format(org_name=org,startdt=startdate,enddt=enddate)
#call "getproject" function to check url and token to further create required csv
getproject(url,token)
FilterData()

[+] in your getproject function,
you should use a try except block to handle http errors etc.
[+] if the csv file you're trying to download is quite large, it may be best to write the data in chunks.
As for the fields = next(reader) stopIteration errpr.
I'm not sure. ¯_(ツ)_/¯
Try throwing your code in the debugger and stepping through it.
See: download large file in python with requests
def getproject(url,pat):
try:
# NOTE the stream=True parameter below
with requests.get(url, auth=HTTPBasicAuth(username='',password=pat), stream=True) as r:
r.raise_for_status()
with open('tmp.csv', 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
# If you have chunk encoded response uncomment if
# and set chunk_size parameter to None.
#if chunk:
f.write(chunk)
except requests.exceptions.ConnectionError as c_error:
print(f"[-] Connection Error: {c_error}")
except requests.exceptions.Timeout as t_error:
print(f"[-] Connection Timeout Error: {t_error}")
except requests.exceptions.RequestException as req_error:
print(f"[-] Some Ambiguous Exception: {req_error}")
# This way seems faster based upon the comments of the link i shared
import requests
import shutil
def download_file(url):
local_filename = url.split('/')[-1]
with requests.get(url, stream=True) as r:
with open(local_filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
return local_filename

Can't write in csv file

When I try to write the information in the csv file, error is thrown:
Traceback (most recent call last):
File "sizeer.py", line 68, in <module>
writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup
proxies = {
"http":"http://195.189.60.97:3128",
"http":"http://103.78.75.165:8080",
"http":"http://212.87.220.2:3128",
"http":"http://88.99.134.61:8080",
"http":"http://103.102.139.178:8080",
"http":"http://218.60.8.83:3129",
"http":"http://124.121.105.193:8888",
"http":"http://198.237.114.54:8080",
"http":"http://36.67.106.58:8080",
"http":"http://35.214.241.28:3128"
}
base_url = ...
page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
#Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
anchor = cw.find("a", recursive=False)
categories.append(anchor['href'])
#Iterrate categories
for category in categories:
cat_page = requests.get(base_url + category, proxies=proxies)
cat_soup = BeautifulSoup(cat_page.content, 'lxml')
products_wrapper = cat_soup.find(class_="b-productList")
cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]
#Iterrate category with pagination and get products
for i in range(1, max_page+1):
cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")
for link in product_links:
#Get product data
product_page = requests.get(base_url+link['href'], proxies=proxies)
product_soup = BeautifulSoup(product_page.content, 'lxml')
#Get product variations
variations = product_soup.find_all(class_="m-productDescr_colorItem")
#If there are variations
if len(variations) > 0:
for v in variations:
variation_page = requests.get(base_url+v['href'], proxies=proxies)
variation_soup = BeautifulSoup(variation_page.content, 'lxml')
price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
name = variation_soup.find(class_="m-productDescr_headline").text.strip()
color = v['title']
print(name)
print(color)
print(price)
print("-------------")
#Save in csv
writer.writerow([name,color,price])
print("SCRAPING DONE")
How to keep the file open through the whole script execution ? Or I have to open it every time I am adding content ? EDIT In fact, the file is not even created.

with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block - that is the block's purpose.
You could put everything inside the block, but that only makes the existing problem worse: the code is reaching several levels of indents, is long and becomes difficult to understand. This is why you use functions to organize the code. For example, if you have the big for loop set in a function:
def do_stuff_with(categories, writer):
for category in categories:
# lots of logic here
# use `writer.writerow` when needed
# Get everything else set up that doesn't need the file, first
categories = ... # do the BeautifulSoup input stuff
# then we can open the file and use the function:
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic, for handling the variations for a single product. Or you can have a function to handle the creation of the categories data, and return it.

Can't figure out how to properly output my data

I'm a relative novice at python but yet, somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron',
2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with in the end is having 5 columns at the end of my .CSV outputting the X-th most commonly used value.
So something in the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter
ts = time.gmtime()
def get_csv_header(top_numb):
fieldnames = ['USER','MEDIA COUNT','FOLLOWERCOUNT','TOTAL LIKES','TOTAL COMMENTS','ER','ER IN %', 'BIO', 'ALL CAPTION TEXT','HASHTAGS COUNTED','MOST COMMON HASHTAGS']
return fieldnames
def write_csv_header(filename, headers):
with open(filename, 'w', newline='') as f_out:
writer = csv.DictWriter(f_out, fieldnames=headers)
writer.writeheader()
return
def read_user_name(t_file):
with open(t_file) as f:
user_list = f.read().splitlines()
return user_list
if __name__ == '__main__':
# HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
# Which contains a list of usernames's BY DEFAULT <current working directory>/userlist.txt
USER_FILE = 'userlist.txt'
# HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv)', Where your final result stays
DATA_FILE = 'users_with_er.csv'
MAX_POST = 12 # MAX POST
print('Starting the engagement calculations... Please wait until it finishes!')
users = read_user_name(USER_FILE)
""" Writing data to csv file """
csv_headers = get_csv_header(MAX_POST)
write_csv_header(DATA_FILE, csv_headers)
for user in users:
post_info = {'USER': user}
url = 'https://www.instagram.com/' + user + '/'
#for troubleshooting, un-comment the next two lines:
#print(user)
#print(url)
try:
r = requests.get(url)
if r.status_code != 200:
print(timestamp,' user {0} not found or page unavailable! Skipping...'.format(user))
continue
soup = BeautifulSoup(r.content, "html.parser")
scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
except ValueError:
print(timestamp,'ValueError for username {0}...Skipping...'.format(user))
continue
except IndexError as error:
# Output expected IndexErrors.
print(timestamp, error)
continue
if j['graphql']['user']['edge_followed_by']['count'] <=0:
print(timestamp,'user {0} has no followers! Skipping...'.format(user))
continue
if j['graphql']['user']['edge_owner_to_timeline_media']['count'] <12:
print(timestamp,'user {0} has less than 12 posts! Skipping...'.format(user))
continue
if j['graphql']['user']['is_private'] is True:
print(timestamp,'user {0} has a private profile! Skipping...'.format(user))
continue
media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
accountname = j['graphql']['user']['username']
followercount = j['graphql']['user']['edge_followed_by']['count']
bio = j['graphql']['user']['biography']
i = 0
total_likes = 0
total_comments = 0
all_captiontext = ''
while i <= 11:
total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
caption_detail = captions['edges'][0]['node']['text']
all_captiontext += caption_detail
i += 1
engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount)/12)*100) + '%'
engagement_rate = (((total_likes + total_comments) / followercount)/12*100)
#isolate and count hashtags
hashtags = re.findall(r'#\w*', all_captiontext)
hashtags_counted = Counter(hashtags)
most_common = hashtags_counted.most_common(5)
with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:
print(timestamp,'Writing Data for user {0}...'.format(user))
post_info["USER"] = accountname
post_info["FOLLOWERCOUNT"] = followercount
post_info["MEDIA COUNT"] = media_count
post_info["TOTAL LIKES"] = total_likes
post_info["TOTAL COMMENTS"] = total_comments
post_info["ER"] = engagement_rate
post_info["ER IN %"] = engagement_rate_percentage
post_info["BIO"] = bio
post_info["ALL CAPTION TEXT"] = all_captiontext
post_info["HASHTAGS COUNTED"] = hashtags_counted
csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
csv_writer.writerow(post_info)
""" Done with the script """
print('ALL DONE !!!! ')
The code that goes before this simply scrapes the webpage, and compiles all the captions from the last 12 posts into "all_captiontext".
Any help to solve this (probably simple) issue would be greatly appreciated as I've been struggling with this for days (again, I'm a noob :') ).

Replace line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
tag_name = counter_tuple[0].replace('#','')
label = "Top %d" % (i + 1)
post_info[label] = tag_name
There's also a bit of code missing. For example, your code doesn't include csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening a file to write just one row. I don't think that's intended, so what you would like to do is to collect the results into a list of dictionaries. A cleaner solution would be to use pandas' dataframe, which you can output straight into a csv file.

most_common being the output of the call to hashtags_counted.most_common, I had a look at the doc here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
Output if formatted the following : [(key, value), (key, value), ...] and ordered in decreasing importance of number of occurences.
Hence, to get only the name and not the number of occurence, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
by
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuple. This statement builds on the fly the list of the first element of each tuple, keeping the sorting order.

Parse data with beatifulsoup using Threads

I have thousands of URLs in a text file, now I want to extract title and price from a product link. I tried to implement threads to do it faster but seems that it's not working correctly, producing duplicate data and executing script too long. Without using threads, the script works as expected.
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv
import lxml
import threading
def runner(fname):
global lck
lck.acquire()
with open(fname, 'r') as f:
for line in f:
r = requests.get(line)
soup = BeautifulSoup(r.content, 'lxml')
try:
title = soup.find('h1', id='itemTitle').text.trim().encode('utf-8')
price = soup.find('span', itemprop='price').text.trim().encode('utf-8')
except:
price = "No price"
with open("Data.csv", 'a', newline='',) as file:
writer = csv.writer(file)
writer.writerow([title, price])
lck.release()
lck = threading.Lock()
fname = "ProductLinks.txt"
threads = []
for i in range(0, 3):
t = threading.Thread(target = runner, args = (fname, ))
threads.append(t)
t.start()
for t in threads:
t.join()
Can someone please guide me, on how to do it correctly, so it can extract and save data parallelly

It is producing duplicate results because when you create the threads you call the same function three times.
t = threading.Thread(target = runner, args = (fname, ))
When you execute the above line, the argument always stays fname which as far as I understand it is always "ProductLinks.txt". Therefore your program will go into runner and there I see that you loop over all the lines of the text.
I suspect that what you want to "parallelise" is exactly that looping over the text lines? Then you would need to write a function parse_line and pass this one into the threading environment.
I would also suggest that you store the values in a dict and export to csv in the end because am not sure if the open environment is thread-safe.
def parse_line(line, result_dict):
r = requests.get(line)
soup = BeautifulSoup(r.content, 'lxml')
try:
title = soup.find('h1', id='itemTitle').text.trim().encode('utf-8')
price = soup.find('span', itemprop='price').text.trim().encode('utf-8')
result_dict[title] = price
except:
result_dict['No title'] = "No price"
Now, say that you have a list with all the lines in your file as strings. You can achieve that by doing the following
file_lines = []
with open(fname, 'r') as f:
for line in f:
file_lines.append(line)
Then you can call this function using Threading over the list of all lines in your file
my_dict = {}
for input_line in file_lines:
t = threading.Thread(target = parse_line, args = (input_line, my_dict))
threads.append(t)
t.start()
Finally you can export your dict to csv using pandas
import pandas as pd
pd.DataFrame(my_dict).to_csv("Data.csv")

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Multithreading Scrape Html and Safely Save to One File - python

Related

Multi threading not processing full list

python script failling to read csvfile with error - StopIteration

Can't write in csv file

Can't figure out how to properly output my data

Parse data with beatifulsoup using Threads

Categories

Resources