Unable to delete some csv file conditionally - python

I've written a script in python to get some tabular content from a webpage and write it to a csv file. What I wish to do now is have my script write the content to the csv file only if the table (visible as Top Mutual Fund Holders) is available on that page; otherwise it should remove the csv file that has been created.
The table is available in this webpage.
The table I'm looking for is not available in this webpage.
This is my attempt:
import os
import csv
import requests
from bs4 import BeautifulSoup

url = "https://finance.yahoo.com/quote/UBER/holders?p=UBER"

def get_mutual_fund(soup):
    datalist = []
    for items in soup.select_one("h3:contains('Top Mutual Fund Holders')").find_next_sibling().select("table tr"):
        data = [item.text for item in items.select("th,td")]
        datalist.append(data)
    return datalist

def get_records(link):
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text,"lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = ""
    if item_one:
        writer.writerows(item_one)
    else:
        os.remove("mutual_fund.csv")
    return item_one

if __name__ == '__main__':
    with open("mutual_fund.csv","w",newline="") as f:
        writer = csv.writer(f)
        for elem in get_records(url):
            print(elem)
I've tried with the link that does not have that table. However, it throws the following error while deleting the csv file:
Traceback (most recent call last):
  File "C:\Users\WCS\AppData\Local\Programs\Python\Python37-32\demo.py", line 33, in <module>
    for elem in get_records(url):
  File "C:\Users\WCS\AppData\Local\Programs\Python\Python37-32\demo.py", line 27, in get_records
    os.remove("mutual_fund.csv")
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'mutual_fund.csv'
How can I delete the csv file when the tabular content is not present?

You are trying to delete the file while it is still open for writing.
You should change your main function accordingly.
def get_records(link):
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text,"lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = None
    return item_one

if __name__ == '__main__':
    delete_file = False
    with open("mutual_fund.csv","w",newline="") as f:
        writer = csv.writer(f)
        try:
            for elem in get_records(url):
                writer.writerow(elem)  # write the row as well as printing it
                print(elem)
        except TypeError:
            delete_file = True
    # the with block has closed the file by this point, so deleting works
    if delete_file:
        os.remove("mutual_fund.csv")

If you keep your existing logic as it is and delete the file when there is no content for the csv, the following should work:
import os
import csv
import requests
from bs4 import BeautifulSoup

# url = "https://finance.yahoo.com/quote/fb/holders?p=FB"
url = "https://finance.yahoo.com/quote/UBER/holders?p=UBER"

def get_mutual_fund(soup):
    datalist = []
    for items in soup.select_one("h3:contains('Top Mutual Fund Holders')").find_next_sibling().select("table tr"):
        data = [item.text for item in items.select("th,td")]
        datalist.append(data)
    return datalist

def get_records(link):
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text,"lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = ""
    if item_one:
        writer.writerows(item_one)
    else:
        f.close()
        os.remove('mutual_fund.csv')

if __name__ == '__main__':
    with open("mutual_fund.csv","w",newline="") as f:
        writer = csv.writer(f)
        get_records(url)
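Alternatively, you could sidestep the deletion entirely by not creating the file until you know the table exists. A minimal sketch of that approach, reusing get_mutual_fund() from above (write_if_available is a hypothetical helper name, not part of the question's code):

import csv
import requests
from bs4 import BeautifulSoup

def write_if_available(link, filename="mutual_fund.csv"):
    # Only create the csv once we know the table is present,
    # so there is never a half-written file to delete.
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text, "lxml")
    try:
        rows = get_mutual_fund(soup_obj)  # defined in the question's script
    except AttributeError:
        return None  # table missing; no file is created
    with open(filename, "w", newline="") as f:
        csv.writer(f).writerows(rows)
    return rows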

Related

Multi threading not processing full list

I am using multi-threading to visit links read from a csv. Strangely, irrespective of max_workers, and even when I remove the multi-threading part entirely, the code processes an arbitrarily smaller number of urls than are in the list. I print the list to verify the count. For example, if the list has 5000 urls, the code stops at 4084; if there are 13,000 links it will stop at 9200; even with just 130 links it will stop at 80 or so. What am I doing wrong here?
import requests
import xlrd
import concurrent.futures
from bs4 import BeautifulSoup
import csv

header_added = False
file_location = "Urls.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_index(0)
all_links = []
for row in range(1, 11000):
    all_links.append(sheet.cell_value(row, 0))
print(len(all_links))
i = 0

def get_solution(url):
    global header_added, i
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'html.parser')
    ques_div = soup.find('p', class_='header-description')
    ques = ques_div.find('span').text
    ans_divs = soup.findAll('div', class_='puzzle-solution')
    ans = ans_divs[0].text
    print("Solution ", i)
    i += 1
    dict1 = {"Words": ques, "Solution": ans}
    with open('Results10k.csv', 'a+', encoding='utf-8') as f:
        w = csv.DictWriter(f, dict1.keys())
        if not header_added:
            w.writeheader()
            header_added = True
        w.writerow(dict1)

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    result = executor.map(get_solution, all_links)
Here's a reworking of your code that doesn't need locks – instead, there's only ever one process that writes to the file.
Also, because the HTML parsing is CPU-bound and subject to the GIL, a ThreadPool will likely be slower here than a process-backed Pool.
import csv
import multiprocessing

import requests
import xlrd
from bs4 import BeautifulSoup

sess = requests.Session()

def get_solution(url):
    try:
        resp = sess.get(url)
        resp.raise_for_status()
        page = resp.text
        soup = BeautifulSoup(page, "html.parser")
        ques_div = soup.find("p", class_="header-description")
        ques = ques_div.find("span").text.strip()
        ans_divs = soup.findAll("div", class_="puzzle-solution")
        ans = ans_divs[0].text.strip()
        return {"URL": url, "Words": ques, "Solution": ans, "Error": ""}
    except Exception as exc:
        print(url, "Error:", exc)
        return {"URL": url, "Words": "", "Solution": "", "Error": str(exc)}

def read_links(file_location):
    workbook = xlrd.open_workbook(file_location)
    sheet = workbook.sheet_by_index(0)
    all_links = []
    for row in range(1, 11000):
        all_links.append(sheet.cell_value(row, 0))
    return all_links

def main():
    links = read_links("./Urls.xlsx")
    with open("Results10k.csv", "w", encoding="utf-8") as f:
        with multiprocessing.Pool() as p:  # (or multiprocessing.pool.ThreadPool)
            for i, result in enumerate(p.imap_unordered(get_solution, links, chunksize=16)):
                if i == 0:
                    writer = csv.DictWriter(f, result.keys())
                    writer.writeheader()
                writer.writerow(result)
                f.flush()  # Ensure changes are written immediately
                if i % 100 == 0:  # Progress indicator
                    print(i)

if __name__ == "__main__":
    main()
It could be that get_solution() crashes for some of the URLs. You could add a try/except in the body of the function and write all crashed URLs to a different file.
def get_solution(url):
    try:
        ...
    except:
        with open('errors.txt','a+') as f:
            f.write(url+'\n')
If this is the problem, the numbers should add up to the total number.
Also, appending to the same file from multiple threads without a lock is probably not safe:
import threading

file_lock = threading.Lock()

def get_solution(url):
    with file_lock:
        with open('Results10k.csv', 'a+', encoding='utf-8') as f:
            w = csv.DictWriter(f, dict1.keys())
            ...
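For completeness, a minimal self-contained sketch of that locking idea, assuming the question's filename and a row dict like dict1 (write_result is a hypothetical helper; the lock is held only around the file access so the scraping itself still runs concurrently):

import csv
import threading

file_lock = threading.Lock()
header_added = False

def write_result(row):
    # Serialize only the file access; scraping stays concurrent.
    global header_added
    with file_lock:
        with open('Results10k.csv', 'a+', encoding='utf-8', newline='') as f:
            w = csv.DictWriter(f, row.keys())
            if not header_added:
                w.writeheader()
                header_added = True
            w.writerow(row)

Each thread would then call write_result({"Words": ques, "Solution": ans}) instead of opening the file itself.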

Can't write in csv file

When I try to write the information to the csv file, the following error is thrown:
Traceback (most recent call last):
  File "sizeer.py", line 68, in <module>
    writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup

proxies = {
    "http":"http://195.189.60.97:3128",
    "http":"http://103.78.75.165:8080",
    "http":"http://212.87.220.2:3128",
    "http":"http://88.99.134.61:8080",
    "http":"http://103.102.139.178:8080",
    "http":"http://218.60.8.83:3129",
    "http":"http://124.121.105.193:8888",
    "http":"http://198.237.114.54:8080",
    "http":"http://36.67.106.58:8080",
    "http":"http://35.214.241.28:3128"
}

base_url = ...

page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
    exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')

with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])

# Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
    anchor = cw.find("a", recursive=False)
    categories.append(anchor['href'])

# Iterate categories
for category in categories:
    cat_page = requests.get(base_url + category, proxies=proxies)
    cat_soup = BeautifulSoup(cat_page.content, 'lxml')
    products_wrapper = cat_soup.find(class_="b-productList")
    cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
    max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]
    # Iterate category with pagination and get products
    for i in range(1, max_page+1):
        cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
        cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
        product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")
        for link in product_links:
            # Get product data
            product_page = requests.get(base_url+link['href'], proxies=proxies)
            product_soup = BeautifulSoup(product_page.content, 'lxml')
            # Get product variations
            variations = product_soup.find_all(class_="m-productDescr_colorItem")
            # If there are variations
            if len(variations) > 0:
                for v in variations:
                    variation_page = requests.get(base_url+v['href'], proxies=proxies)
                    variation_soup = BeautifulSoup(variation_page.content, 'lxml')
                    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
                    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
                    color = v['title']
                    print(name)
                    print(color)
                    print(price)
                    print("-------------")
                    # Save in csv
                    writer.writerow([name,color,price])

print("SCRAPING DONE")
How do I keep the file open through the whole script execution? Or do I have to open it every time I add content? EDIT: In fact, the file is not even created.
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block - that is the block's purpose.
You could put everything inside the block, but that only makes the existing problem worse: the code already reaches several levels of indentation, is long, and is becoming difficult to understand. This is why you use functions to organize code. For example, put the big for loop in a function:
def do_stuff_with(categories, writer):
    for category in categories:
        ...  # lots of logic here; use `writer.writerow` when needed

# Get everything else set up that doesn't need the file, first
categories = ...  # do the BeautifulSoup input stuff

# then we can open the file and use the function:
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])
    do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic for handling the variations of a single product (see the sketch below). Or you can have a function that handles the creation of the categories data and returns it.
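A hedged sketch of that first suggestion, reusing the question's names (base_url and proxies are assumed to be module-level globals; write_variations is a hypothetical helper, one possible decomposition among several):

def write_variations(product_soup, writer):
    # Handle all colour variations of a single product,
    # writing one csv row per variation.
    for v in product_soup.find_all(class_="m-productDescr_colorItem"):
        variation_page = requests.get(base_url + v['href'], proxies=proxies)
        variation_soup = BeautifulSoup(variation_page.content, 'lxml')
        price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
        name = variation_soup.find(class_="m-productDescr_headline").text.strip()
        writer.writerow([name, v['title'], price])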

Iteration Url Load from CSV in Python

Please help me.
I have URL data in a CSV file; the file has 100 rows and 1 column.
I want to load the data from line 1 to line 100 of the CSV using Python. How do I write that code?
However, when I run it, the loop only works once, on one of the lines; it does not reach the end of the URLs in the CSV and does not continue to the next URL.
disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
AttributeError: 'NoneType' object has no attribute 'text'
How do I carry on when an error occurs because an element is not found in the html?
Below is the Python code I use; please help me make the scraping loop run to the end of the URL list.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import pandas

with open('Url Torch.csv','rt') as f:
    data = csv.reader(f, delimiter=',')
    for row in data:
        URL_GO = row[2]

def variable_Scrape(url):
    try:
        cookies = dict(cookie="............")
        request = requests.get(url, cookies=cookies)
        html = BeautifulSoup(request.content, 'html.parser')
        title = html.find('div', class_='title').text.strip().strip('\n')
        desc = html.find('div', class_='content').text
        link = html.find_all('img', class_='lazyload slide-item owl-lazy')
        normal_price = html.find('div', class_='amount public').text.strip().strip('\n')
        disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
    except AttributeError as e:
        print(e)
        #ConnectionAbortedError
        return False
    else:
        print(title)
        #print(desc)
        #print(link)
    finally:
        print(title)
        print(desc)
        print(link)
        print('Finally.....')

variable_Scrape(URL_GO)
It is hard to give you an exact answer without seeing your csv file, but try this:
import csv

f = open('you_file.csv')
csv_f = csv.reader(f)
for row in csv_f:
    print(row[0])
This is the code:

import csv

data = []  # create an empty list to store rows in
with open('emails.csv') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        data.append(row)  # add each row to the list
Based on your comments, to skip to the next loop iteration when a url is not OK:
for url in data:  # data is the list where the urls are stored
    try:
        ...  # do your code here (requests, beautifulsoup):
        # r = requests.get(url) ...
    except:
        pass  # will go to the next loop (next url) if an error happens
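Putting the pieces together, a minimal sketch of the whole loop. It assumes the urls are in the first column of the csv (the question's code reads row[2], so adjust the index to match your file) and reuses the question's filename and find() calls:

import csv
import requests
from bs4 import BeautifulSoup

with open('Url Torch.csv', 'rt') as f:
    urls = [row[0] for row in csv.reader(f)]  # adjust the column index if needed

for url in urls:
    try:
        html = BeautifulSoup(requests.get(url).content, 'html.parser')
        title = html.find('div', class_='title').text.strip()
        # this block is absent on some pages, which raises AttributeError
        disc = html.find('div', class_='alert alert-info global-promo').text.strip()
        print(title, disc)
    except AttributeError as e:
        print(url, 'skipped:', e)  # keep going with the next url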

How to fix "cannot serialize '_io.BufferedReader' object" while multiprocessing?

I get the following error when trying to parse a large number of web pages from a website: "Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'". How can I fix it?
The full error message is:
File "main.py", line 29, in <module>
records = p.map(defs.scrape,state_urls)
File "C:\Users\Utilisateur\Anaconda3\lib\multiprocessing\pool.py", line 266, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\Utilisateur\Anaconda3\lib\multiprocessing\pool.py", line 644, in get
raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x0000018DD1C3D828>'. Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'
I browsed through some of the answers for similar questions here, namely this one (multiprocessing.pool.MaybeEncodingError: Error sending result: Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)') but I don't think I'm running into the same issue, as I don't handle files directly in the scrape function.
I tried modifying the scrape function so it returned a string and not a list (I don't know why I did that), but it didn't work.
From the main.py file:

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from multiprocessing import Pool
import codecs
import defs

if __name__ == '__main__':
    filename = "some_courts_test.csv"
    # not the actual values
    courts = ["blabla", "blablabla", "blablabla", "blabla"]
    client = defs.init_client()
    i = 1

    # scrapes the data from the website and puts it into a csv file
    for court in courts:
        records = []
        records_string = ""
        print("creating a file for the court of : " + court)
        f = defs.init_court_file(court)
        print("generating urls for the court of " + court)
        state_urls = defs.generate_state_urls(court)
        for url in state_urls:
            print(url)
        print("scraping creditors from : " + court)
        p = Pool(10)
        records = p.map(defs.scrape, state_urls)
        records_string = ''.join(records[1])
        p.terminate()
        p.join()
        for r in records_string:
            f.write(r)
        records = []
        f.close()
From the defs file:

def scrape(url):
    data = []
    row_string = ' '
    final_data = []
    final_string = ' '
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    table = page_soup.find("table", {"class":"table table-striped"})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.replace(',',' ') for ele in cols]  # cleans it up
        for ele in cols:
            if ele:
                data.append(ele)
                data.append(',')
        data.append('\n')
    return data
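A hedged guess at the cause: when a worker raises an exception (for example, uReq raising an HTTPError, whose response object wraps an _io.BufferedReader), the Pool tries to pickle that exception to send it back to the parent, and the pickling fails with exactly this TypeError. A minimal sketch of catching errors inside scrape so nothing unpicklable ever crosses the process boundary (this assumes defs imports uReq and soup the same way main.py does):

def scrape(url):
    # Return plain, picklable data (a list of strings) and never let an
    # exception carrying an open response object travel back through the Pool.
    data = []
    try:
        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()
    except Exception as exc:        # e.g. urllib.error.HTTPError
        print(url, "failed:", exc)
        return data                 # empty result, but picklable
    page_soup = soup(page_html, "html.parser")
    table = page_soup.find("table", {"class": "table table-striped"})
    if table is None or table.find('tbody') is None:
        return data                 # page had no matching table
    for row in table.find('tbody').find_all('tr'):
        cols = [ele.text.replace(',', ' ') for ele in row.find_all('td')]
        for ele in cols:
            if ele:
                data.append(ele)
                data.append(',')
        data.append('\n')
    return data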

Writing scraped data to csv

I've been stuck trying to transfer the data that I scraped to a csv file. Here is my code:
import requests, bs4, csv, sys
reload(sys)
sys.setdefaultencoding('utf-8')

url = 'http://www.constructeursdefrance.com/resultat/?dpt=01'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text,'html.parser')
links = []
for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)

for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data = each.text, each.next_sibling
        with open('french.csv', 'wb') as file:
            writer = csv.writer(file)
            writer.writerows(data)
The output says:

Traceback (most recent call last):
  File "test_new_project.py", line 23, in <module>
    writer.writerows(data)
csv.Error: sequence expected
But I am trying to put tuples into the csv file, and as far as I know csv accepts tuples and lists. How can I fix this problem?
Atirag is correct, but you have another issue: your with call to open the output file is nested inside a for loop. So if there is more than one link, the file is overwritten each time and the output will not be what you expect. I think this should generate the output you intend:
for div in soup.select('.link'):
    link = div.a.get('href')
    links.append(link)

with open("french.csv", "w") as file:
    writer = csv.writer(file)
    for i in links:
        res2 = requests.get(i)
        soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
        for each in soup2.select('li > strong'):
            writer.writerow([each.text, each.next_sibling])
Change this:

for each in soup2.select('li > strong'):
    data = each.text, each.next_sibling

to this:

data = []
for each in soup2.select('li > strong'):
    data.append((each.text, each.next_sibling))
Your data variable was a single tuple, not a list of tuples. The above code creates a list of tuples.
Another solution is this (mind the indentation):

data = []
for i in links:
    url2 = i
    res2 = requests.get(url2)
    soup2 = bs4.BeautifulSoup(res2.text, 'html.parser')
    for each in soup2.select('li > strong'):
        data.append((each.text, each.next_sibling))

with open('french.csv', 'wb') as file:
    writer = csv.writer(file)
    writer.writerows(data)
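One caveat worth noting: the question's code is Python 2 (reload(sys), sys.setdefaultencoding, and 'wb' mode for csv). On Python 3 the csv module wants a text-mode file opened with newline='', otherwise you get blank rows on Windows. A minimal sketch of the final write on Python 3:

import csv

# text mode plus newline='' replaces Python 2's 'wb' for csv output
with open('french.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(data)  # `data` is the list of tuples built above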
