Parse data with BeautifulSoup using threads - Python

I have thousands of URLs in a text file, now I want to extract title and price from a product link. I tried to implement threads to do it faster but seems that it's not working correctly, producing duplicate data and executing script too long. Without using threads, the script works as expected.
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv
import lxml
import threading
def runner(fname):
global lck
lck.acquire()
with open(fname, 'r') as f:
for line in f:
r = requests.get(line)
soup = BeautifulSoup(r.content, 'lxml')
try:
title = soup.find('h1', id='itemTitle').text.trim().encode('utf-8')
price = soup.find('span', itemprop='price').text.trim().encode('utf-8')
except:
price = "No price"
with open("Data.csv", 'a', newline='',) as file:
writer = csv.writer(file)
writer.writerow([title, price])
lck.release()
lck = threading.Lock()
fname = "ProductLinks.txt"
threads = []
for i in range(0, 3):
t = threading.Thread(target = runner, args = (fname, ))
threads.append(t)
t.start()
for t in threads:
t.join()
Can someone please guide me, on how to do it correctly, so it can extract and save data parallelly

It is producing duplicate results because when you create the threads you call the same function three times.
t = threading.Thread(target = runner, args = (fname, ))
When you execute the above line, the argument always stays fname which as far as I understand it is always "ProductLinks.txt". Therefore your program will go into runner and there I see that you loop over all the lines of the text.
I suspect that what you want to "parallelise" is exactly that looping over the text lines? Then you would need to write a function parse_line and pass this one into the threading environment.
I would also suggest that you store the values in a dict and export them to CSV at the end, because I am not sure whether appending to the same file from several threads is thread-safe.
def parse_line(line, result_dict):
    """Fetch one product page and record its title and price.

    Args:
        line: One line of the links file holding a product URL. Surrounding
            whitespace (including the trailing newline) is stripped before
            the request is made.
        result_dict: Dict that receives ``title -> price``. If the page cannot
            be fetched or the expected tags are missing,
            ``'No title' -> "No price"`` is stored instead.
    """
    try:
        # A timeout keeps one unresponsive server from blocking the thread forever.
        r = requests.get(line.strip(), timeout=30)
        soup = BeautifulSoup(r.content, 'lxml')
        # Python strings are trimmed with .strip(); the text is kept as str so
        # it is written to the CSV verbatim rather than as a bytes repr.
        title = soup.find('h1', id='itemTitle').text.strip()
        price = soup.find('span', itemprop='price').text.strip()
    except (requests.RequestException, AttributeError):
        # AttributeError: soup.find() returned None because the tag is missing.
        result_dict['No title'] = "No price"
    else:
        result_dict[title] = price
Now, say that you have a list with all the lines in your file as strings. You can achieve that by doing the following
# Read every URL up front, dropping the trailing newline and skipping blank
# lines so that each entry can be passed straight to requests.get().
with open(fname, 'r') as f:
    file_lines = [line.strip() for line in f if line.strip()]
Then you can call this function using Threading over the list of all lines in your file
from concurrent.futures import ThreadPoolExecutor

my_dict = {}
# A bounded pool avoids starting one OS thread per URL (the file holds
# thousands). Leaving the `with` block waits for every submitted task to
# finish, so my_dict is complete before it is exported.
with ThreadPoolExecutor(max_workers=10) as executor:
    for input_line in file_lines:
        executor.submit(parse_line, input_line, my_dict)
Finally you can export your dict to csv using pandas
import pandas as pd

# DataFrame() rejects a dict whose values are all scalars unless an index is
# given, so build one (title, price) row per entry instead.
rows = list(my_dict.items())
pd.DataFrame(rows, columns=["Title", "Price"]).to_csv("Data.csv", index=False)

Related

Multi threading not processing full list

I am using multi-threading to visit links read from a csv, strangely irrespective of the max-workers or even when I remove the multi-threading part, the code runs for an arbitrarily lower number of urls than in the list. I print the list to verify the count. For e.g if the list has 5000 urls, the code stops at 4084, if the links are 13,000 it will stop at 9200, even when it is just 130 links it will stop at 80 or something. What am I doing wrong here?
import requests
import xlrd
import concurrent.futures
from bs4 import BeautifulSoup
import csv
header_added = False
file_location = "Urls.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_index(0)
all_links = []
for row in range(1, 11000):
all_links.append(sheet.cell_value(row,0))
print(len(all_links))
i = 0
def get_solution(url):
global header_added, i
page = requests.get(url).text
soup = BeautifulSoup(page, 'html.parser')
ques_div = soup.find('p', class_='header-description')
ques = ques_div.find('span').text
ans_divs = soup.findAll('div', class_='puzzle-solution')
ans = ans_divs[0].text
print("Solution ", i)
i += 1
dict1 ={"Words": ques, "Solution": ans}
with open('Results10k.csv', 'a+', encoding='utf-8') as f:
w = csv.DictWriter(f, dict1.keys())
if not header_added:
w.writeheader()
header_added = True
w.writerow(dict1)
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
result = executor.map(get_solution, all_links)
Here's a reworking of your code that doesn't need locks – instead, there's only ever one process that writes to the file.
Also, because parsing the HTML is CPU-bound work that the GIL serializes across threads, a ThreadPool may well be slower than a process-backed Pool.
import csv
import multiprocessing
import requests
import xlrd
from bs4 import BeautifulSoup
sess = requests.Session()
def get_solution(url):
    """Fetch *url* and extract the puzzle words and their solution.

    Every ``Exception`` (network failure, HTTP error status, missing element)
    is caught, printed, and reported through the "Error" field, so one bad
    URL cannot abort the whole run.

    Returns:
        dict with the keys "URL", "Words", "Solution" and "Error"; "Error" is
        an empty string on success.
    """
    try:
        # Without a timeout a single unresponsive server stalls its worker forever.
        resp = sess.get(url, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        ques_div = soup.find("p", class_="header-description")
        ques = ques_div.find("span").text.strip()
        # find_all is the current name of the legacy findAll alias.
        ans_divs = soup.find_all("div", class_="puzzle-solution")
        ans = ans_divs[0].text.strip()
        return {"URL": url, "Words": ques, "Solution": ans, "Error": ""}
    except Exception as exc:
        print(url, "Error:", exc)
        return {"URL": url, "Words": "", "Solution": "", "Error": str(exc)}
def read_links(file_location):
    """Return the values in the first column of the workbook's first sheet.

    Row 0 is skipped (presumably a header row -- confirm against the file).

    Args:
        file_location: Path of the Excel workbook to read.

    Returns:
        list with one cell value per row, from row 1 to the last row.
    """
    workbook = xlrd.open_workbook(file_location)
    sheet = workbook.sheet_by_index(0)
    # Use the sheet's real row count: a hard-coded upper bound raises
    # IndexError on shorter sheets and silently truncates longer ones.
    return [sheet.cell_value(row, 0) for row in range(1, sheet.nrows)]
def main():
    """Scrape every link and stream the results into Results10k.csv.

    Only this (parent) process writes to the file, so no locking is needed;
    the pool workers just return result dicts.
    """
    links = read_links("./Urls.xlsx")
    # newline="" is what the csv module expects; without it, platforms that
    # translate "\n" to "\r\n" end up with a blank line after every row.
    with open("Results10k.csv", "w", encoding="utf-8", newline="") as f:
        with multiprocessing.Pool() as p:  # (or multiprocessing.pool.ThreadPool)
            results = p.imap_unordered(get_solution, links, chunksize=16)
            for i, result in enumerate(results):
                if i == 0:
                    # The first result supplies the column names.
                    writer = csv.DictWriter(f, result.keys())
                    writer.writeheader()
                writer.writerow(result)
                f.flush()  # Ensure changes are written immediately
                if i % 100 == 0:  # Progress indicator
                    print(i)
if __name__ == "__main__":
main()
It could be, that get_solution() crashes for some of the URLs. You could add a try/except in the body of the function and write all crashed URLS to a different file.
def get_solution(url):
try:
...
except:
with open('errors.txt','a+') as f:
f.write(url+'\n')
If this is the problem the numbers should add up to the total number.
Also open() is probably not thread safe.
file_lock = threading.Lock()
def get_solution(url):
with file_lock:
with open('Results10k.csv', 'a+', encoding='utf-8') as f:
w = csv.DictWriter(f, dict1.keys())
...

Can't write in csv file

When I try to write the information in the csv file, error is thrown:
Traceback (most recent call last):
File "sizeer.py", line 68, in <module>
writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup
proxies = {
"http":"http://195.189.60.97:3128",
"http":"http://103.78.75.165:8080",
"http":"http://212.87.220.2:3128",
"http":"http://88.99.134.61:8080",
"http":"http://103.102.139.178:8080",
"http":"http://218.60.8.83:3129",
"http":"http://124.121.105.193:8888",
"http":"http://198.237.114.54:8080",
"http":"http://36.67.106.58:8080",
"http":"http://35.214.241.28:3128"
}
base_url = ...
page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
#Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
anchor = cw.find("a", recursive=False)
categories.append(anchor['href'])
#Iterrate categories
for category in categories:
cat_page = requests.get(base_url + category, proxies=proxies)
cat_soup = BeautifulSoup(cat_page.content, 'lxml')
products_wrapper = cat_soup.find(class_="b-productList")
cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]
#Iterrate category with pagination and get products
for i in range(1, max_page+1):
cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")
for link in product_links:
#Get product data
product_page = requests.get(base_url+link['href'], proxies=proxies)
product_soup = BeautifulSoup(product_page.content, 'lxml')
#Get product variations
variations = product_soup.find_all(class_="m-productDescr_colorItem")
#If there are variations
if len(variations) > 0:
for v in variations:
variation_page = requests.get(base_url+v['href'], proxies=proxies)
variation_soup = BeautifulSoup(variation_page.content, 'lxml')
price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
name = variation_soup.find(class_="m-productDescr_headline").text.strip()
color = v['title']
print(name)
print(color)
print(price)
print("-------------")
#Save in csv
writer.writerow([name,color,price])
print("SCRAPING DONE")
How to keep the file open through the whole script execution ? Or I have to open it every time I am adding content ? EDIT In fact, the file is not even created.
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block - that is the block's purpose.
You could put everything inside the block, but that only makes the existing problem worse: the code is reaching several levels of indents, is long and becomes difficult to understand. This is why you use functions to organize the code. For example, if you have the big for loop set in a function:
def do_stuff_with(categories, writer):
for category in categories:
# lots of logic here
# use `writer.writerow` when needed
# Get everything else set up that doesn't need the file, first
categories = ... # do the BeautifulSoup input stuff
# then we can open the file and use the function:
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic, for handling the variations for a single product. Or you can have a function to handle the creation of the categories data, and return it.

How to delete data from csv file using python

I am scraping a website for the course number and the course name. But if a course number does not have a name or vice versa, the data should be skipped from the final output. I do not know how to do that.
from bs4 import BeautifulSoup
from urllib import urlopen
import csv
source = urlopen('https://www.rit.edu/study/computing-security-bs')
csv_file1 = open('scrape.csv', 'w')
csv_writer = csv.writer(csv_file1)
csv_writer.writerow(['Course Number', 'Course Name'])
soup = BeautifulSoup(source, 'lxml')
table = soup.find('div', class_='processed-table')
#print(table)
curriculum = table.find('curriculum')
#print(curriculum.prettify())
next = curriculum.find('table', class_='table-curriculum')
#print(next.prettify())
for course_num in next.find_all('tr', class_='hidden-row rows-1'):
num = course_num.find_all('td')[0]
real = num.get_text()
# print(real)
realstr = real.encode('utf-8')
name = course_num.find('div', class_='course-name')
realname = name.get_text()
# print(realname)
realnamestr = realname.encode('utf-8')
csv_writer.writerow([realstr, realnamestr])
csv_file1.close()
This is my csv
csv
I want to get rid of the last 4 rows.
As @zvone suggested, a continue will do the job here. I am writing this answer because you mentioned that you are not aware of the keyword.
Just before csv_writer.writerow([realstr, realnamestr]), add an if that checks realstr and continues:
if realstr.stip() == "":
continue
I think you should still go through the continue, break and else keywords and how they can be helpful in controlling your loops.
Another approach would be to put data into csv_writer only when realstr has some value. So:
# Write the row only when the course number is non-blank.
if realstr.strip():
    csv_writer.writerow([realstr, realnamestr])

Multithreading Scrape Html and Safely Save to One File

I want to scrape the title from each given URL using multiple threads (for example, 5 threads)
and save the titles to one text file. How do I do it, and how do I make sure the output is saved safely to one file?
this is my code:
import csv
import requests
requests.packages.urllib3.disable_warnings()
urls = []
with open('Input.csv') as csvDataFile:
csvReader = csv.reader(csvDataFile)
for row in csvReader:
urls.append(row[1])
def find_between( s, first, last ):
try:
start = s.index( first ) + len( first )
end = s.index( last, start )
return s[start:end]
except ValueError:
return ""
def get_title( url ):
try:
r = requests.get(url)
html_content = r.text.encode('UTF-8')
title = find_between(html_content , "<title>", "</title>")
return title
except:
return ""
for url in urls:
f = open('myfile.txt', 'a')
f.write(get_title(url) + '\n')
f.close()
Try using concurrent.futures:
1. Create a pool.
2. Submit the function and its parameters.
3. Get the result from each future.
import csv
from concurrent import futures

# Five worker threads fetch the titles concurrently. Leaving the `with` block
# waits for every submitted call to finish, so no polling loop is needed.
with futures.ThreadPoolExecutor(max_workers=5) as pool:
    workers = [pool.submit(get_title, url) for url in urls]

# Only the main thread writes, so the output file needs no locking.
with open('myfile.txt', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows([worker.result()] for worker in workers)

beautifulsoup to csv: putting paragraph of text into one line

I have a bunch of web text that I'd like to scrape and export to a csv file. The problem is that the text is split over multiple lines on the website and that's how beautifulsoup reads it. When I export to csv, all the text goes into one cell but the cell has multiple lines of text. When I try to read the csv into another program, it interprets the multiple lines in a way that yields a nonsensical dataset. The question is, how do I put all the text into a single line after I pull it with beautifulsoup but before I export to csv?
Here's a simple working example demonstrating the problem of multiple lines (in fact, the first few lines in the resulting csv are blank, so at first glance it may look empty):
import csv
import requests
from bs4 import BeautifulSoup
def main():
r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
soup = BeautifulSoup(r.text,"html.parser")
with open('Temp.csv', 'w', encoding='utf8', newline='') as f:
writer = csv.writer(f,delimiter=",")
abstract=soup.find("article").text
writer.writerow([abstract])
if __name__ == '__main__':
main()
UPDATE: there have been some good suggestions, but it's still not working. The following code still produces a csv file with line breaks in a cell:
import csv
import requests
from bs4 import BeautifulSoup
with open('Temp.csv', 'w', encoding='utf8', newline='') as f:
writer = csv.writer(f,delimiter=',')
r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
soup = BeautifulSoup(r.text,'lxml')
find_article = soup.find('article')
find_2para = find_article.p.find_next_sibling("p")
find_largetxt = find_article.p.find_next_sibling("p").nextSibling
writer.writerow([find_2para,find_largetxt])
Here's another attempt based on a different suggestion. This one also ends up producing a line break in the csv file:
import csv
import requests
from bs4 import BeautifulSoup
def main():
r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
soup = BeautifulSoup(r.text,"html.parser")
with open('Temp.csv', 'w', encoding='utf8', newline='') as f:
writer = csv.writer(f,delimiter=",")
abstract=soup.find("article").get_text(separator=" ", strip=True)
writer.writerow([abstract])
if __name__ == '__main__':
main()
Change your abstract = ... line into:
abstract = soup.find("article").get_text(separator=" ", strip=True)
It'll join the pieces of text using the separator parameter (in this case, a single space).
The solution that ended up working for me is pretty simple:
abstract=soup.find("article").text.replace("\t", "").replace("\r", "").replace("\n", "")
That gets rid of all line breaks.
r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
soup = BeautifulSoup(r.text,'lxml') # I prefer using the lxml parser
find_article = soup.find('article')
# The first <h3> inside the article; in this case the title: Econometrica: Mar 2017, Volume 85, Issue 2
find_title = find_article.h3
# The first <h1> inside the article (the "search yield" heading)
find_yeild = find_article.h1
# The first <p>, for example: DOI: 10.3982/ECTA14057 p. 351-378
find_1para = find_article.p
# The next <p> sibling, for example: David Martinez‐Miera, Rafael Repullo
find_2para = find_article.p.find_next_sibling("p")
# The node right after that second <p>: the large text area, e.g. 'We present a model of the relationship bet...'
find_largetxt = find_article.p.find_next_sibling("p").nextSibling
I used a variety of methods to get to the text areas you want, just for the purpose of education (you can use .text on each of these to get the text without tags, or you can use Zroq's method).
You can then write each of these to the file by doing, for example:
# writerow() expects a sequence of fields; a bare string would be split into
# one column per character, so wrap the text in a list.
writer.writerow([find_title.text])

Categories

Resources