I get the following error when trying to parse a large number of web pages from a website: "Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'". How can I fix it?
The full error message is:
File "main.py", line 29, in <module>
records = p.map(defs.scrape,state_urls)
File "C:\Users\Utilisateur\Anaconda3\lib\multiprocessing\pool.py", line 266, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\Utilisateur\Anaconda3\lib\multiprocessing\pool.py", line 644, in get
raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x0000018DD1C3D828>'. Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'
I browsed through some of the answers to similar questions here, namely this one (multiprocessing.pool.MaybeEncodingError: Error sending result: Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'), but I don't think I'm running into the same issue, as I don't handle files directly in the scrape function.
I tried modifying the scrape function so it returned a string instead of a list (not sure why I did that), but it didn't work.
From the main.py file:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from multiprocessing import Pool
import codecs
import defs
if __name__ == '__main__':
    filename = "some_courts_test.csv"
    # not the actual values
    courts = ["blabla", "blablabla", "blablabla","blabla"]
    client = defs.init_client()
    i = 1

    # scrapes the data from the website and puts it into a csv file
    for court in courts:
        records = []
        records_string = ""
        print("creating a file for the court of : "+court)
        f = defs.init_court_file(court)
        print("generating urls for the court of "+court)
        state_urls = defs.generate_state_urls(court)
        for url in state_urls:
            print(url)
        print("scraping creditors from : "+court)
        p = Pool(10)
        records = p.map(defs.scrape,state_urls)
        records_string = ''.join(records[1])
        p.terminate()
        p.join()
        for r in records_string:
            f.write(r)
        records = []
        f.close()
From the defs file:
def scrape(url):
    data = []
    row_string = ' '
    final_data = []
    final_string = ' '
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()

    page_soup = soup(page_html, "html.parser")
    table = page_soup.find("table", {"class":"table table-striped"})
    table_body = table.find('tbody')

    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.replace(',',' ') for ele in cols] #cleans it up
        for ele in cols:
            if ele:
                data.append(ele)
                data.append(',')
        data.append('\n')
    return(data)
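One pattern worth trying (just a sketch, and it assumes the failure comes from an exception raised inside scrape, for example an HTTPError from urlopen, whose attached response object multiprocessing then fails to pickle when sending the result back to the parent): catch exceptions inside the worker so every task returns plain, picklable data. The scrape_safe wrapper below is a hypothetical helper, not part of the original code.

def scrape_safe(url):
    # Guarantees a picklable return value even if scrape() raises,
    # e.g. an HTTPError that carries an open '_io.BufferedReader'.
    try:
        return scrape(url)
    except Exception as exc:
        return ['error scraping ' + url + ': ' + str(exc), '\n']

You would then map the pool over scrape_safe instead of scrape, e.g. records = p.map(defs.scrape_safe, state_urls), and inspect the returned error strings afterwards.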
Related
When I try to write the information to the csv file, an error is thrown:
Traceback (most recent call last):
File "sizeer.py", line 68, in <module>
writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup

proxies = {
    "http":"http://195.189.60.97:3128",
    "http":"http://103.78.75.165:8080",
    "http":"http://212.87.220.2:3128",
    "http":"http://88.99.134.61:8080",
    "http":"http://103.102.139.178:8080",
    "http":"http://218.60.8.83:3129",
    "http":"http://124.121.105.193:8888",
    "http":"http://198.237.114.54:8080",
    "http":"http://36.67.106.58:8080",
    "http":"http://35.214.241.28:3128"
}

base_url = ...
page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
    exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')

with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])

#Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
    anchor = cw.find("a", recursive=False)
    categories.append(anchor['href'])

#Iterrate categories
for category in categories:
    cat_page = requests.get(base_url + category, proxies=proxies)
    cat_soup = BeautifulSoup(cat_page.content, 'lxml')

    products_wrapper = cat_soup.find(class_="b-productList")
    cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
    max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]

    #Iterrate category with pagination and get products
    for i in range(1, max_page+1):
        cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
        cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
        product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")

        for link in product_links:
            #Get product data
            product_page = requests.get(base_url+link['href'], proxies=proxies)
            product_soup = BeautifulSoup(product_page.content, 'lxml')

            #Get product variations
            variations = product_soup.find_all(class_="m-productDescr_colorItem")

            #If there are variations
            if len(variations) > 0:
                for v in variations:
                    variation_page = requests.get(base_url+v['href'], proxies=proxies)
                    variation_soup = BeautifulSoup(variation_page.content, 'lxml')
                    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
                    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
                    color = v['title']
                    print(name)
                    print(color)
                    print(price)
                    print("-------------")

                    #Save in csv
                    writer.writerow([name,color,price])

print("SCRAPING DONE")
How can I keep the file open through the whole script execution? Or do I have to open it every time I am adding content? EDIT: In fact, the file is not even created.
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block; that is the block's purpose.
You could put everything inside the block, but that would only make the existing problem worse: the code already reaches several levels of indentation, is long, and becomes difficult to understand. This is why you use functions to organize the code. For example, you could put the big for loop in a function:
def do_stuff_with(categories, writer):
    for category in categories:
        # lots of logic here
        # use `writer.writerow` when needed

# Get everything else set up that doesn't need the file, first
categories = ... # do the BeautifulSoup input stuff

# then we can open the file and use the function:
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])
    do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic, for handling the variations for a single product. Or you can have a function to handle the creation of the categories data, and return it.
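For instance, here is a minimal sketch of that further split, using hypothetical helper names (get_categories and handle_variation are illustrations, not part of the original code):

def get_categories(soup):
    # Collect the category hrefs once, independent of the output file.
    categories = []
    for cw in soup.find_all(class_="m-menu_subItem"):
        anchor = cw.find("a", recursive=False)
        categories.append(anchor['href'])
    return categories

def handle_variation(variation_soup, color, writer):
    # Innermost logic for a single product variation: extract one row and write it.
    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
    writer.writerow([name, color, price])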
I've written a script in python to get some tabular content from a webpage and write it to a csv file. What I wish to do now is have my script write the content to a csv file only if the table (shown as Top Mutual Fund Holders) is available on that page; otherwise it should remove the csv file that has been created.
The table is available on this webpage.
The table I'm looking for is not available on this webpage.
This is my attempt:
import os
import csv
import requests
from bs4 import BeautifulSoup

url = "https://finance.yahoo.com/quote/UBER/holders?p=UBER"

def get_mutual_fund(soup):
    datalist = []
    for items in soup.select_one("h3:contains('Top Mutual Fund Holders')").find_next_sibling().select("table tr"):
        data = [item.text for item in items.select("th,td")]
        datalist.append(data)
    return datalist

def get_records(link):
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text,"lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = ""

    if item_one:
        writer.writerows(item_one)
    else:
        os.remove("mutual_fund.csv")
    return item_one

if __name__ == '__main__':
    with open("mutual_fund.csv","w",newline="") as f:
        writer = csv.writer(f)
        for elem in get_records(url):
            print(elem)
I've tried with the link that does not have that table. However, it throws the following error while deleting the csv file:
Traceback (most recent call last):
File "C:\Users\WCS\AppData\Local\Programs\Python\Python37-32\demo.py", line 33, in <module>
for elem in get_records(url):
File "C:\Users\WCS\AppData\Local\Programs\Python\Python37-32\demo.py", line 27, in get_records
os.remove("mutual_fund.csv")
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'mutual_fund.csv'
How can I delete the csv file when the tabular content is not present?
You are actually deleting the file while it is still open for writing.
You should change your main function accordingly.
def get_records(link):
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text,"lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = None
    return item_one

if __name__ == '__main__':
    delete_file = False
    with open("mutual_fund.csv","w",newline="") as f:
        writer = csv.writer(f)
        try:
            for elem in get_records(url):
                print(elem)
        except TypeError:
            delete_file = True
    if delete_file:
        os.remove("mutual_fund.csv")
If you keep your existing logic as it is and delete the file when there is nothing to write to the csv, the following should work:
import os
import csv
import requests
from bs4 import BeautifulSoup

# url = "https://finance.yahoo.com/quote/fb/holders?p=FB"
url = "https://finance.yahoo.com/quote/UBER/holders?p=UBER"

def get_mutual_fund(soup):
    datalist = []
    for items in soup.select_one("h3:contains('Top Mutual Fund Holders')").find_next_sibling().select("table tr"):
        data = [item.text for item in items.select("th,td")]
        datalist.append(data)
    return datalist

def get_records(link):
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text,"lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = ""

    if item_one:
        writer.writerows(item_one)
    else:
        f.close()
        os.remove('mutual_fund.csv')

if __name__ == '__main__':
    with open("mutual_fund.csv","w",newline="") as f:
        writer = csv.writer(f)
        get_records(url)
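A cleaner variant (a sketch only, not from the original answers) is to fetch the rows first and only open the csv file once you know there is something to write, so nothing ever has to be deleted:

if __name__ == '__main__':
    r = requests.get(url)
    soup_obj = BeautifulSoup(r.text, "lxml")
    try:
        rows = get_mutual_fund(soup_obj)
    except AttributeError:
        rows = []

    # Only create the file when the table was actually found.
    if rows:
        with open("mutual_fund.csv", "w", newline="") as f:
            csv.writer(f).writerows(rows)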
Trying to figure out how to use an if not statement with which I can group three to four words to omit from a CSV file. Towards the bottom of the code, you'll see that I'm stuck at: if ('reddit', 'passwords') not in x:
Any help would be great.
# import libraries
import bs4
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.reddit.com/r/NHLStreams/comments/71uhwi/game_thread_sabres_at_maple_leafs_730_pm_et/'

# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

filename = "sportstreams.csv"
f = open(filename, "w")
headers = "Sport Links " + "\n"
f.write(headers)

links = page_soup.select("form a[href]")
for link in links:
    href = link["href"]
    print(href)
    f.write(href + "\n")

with open('sportstreams.csv') as f,open('sstream.csv', "w") as f2:
    for x in f:
        if ('reddit', 'passwords') not in x: # trying to find multi words to omit
            f2.write(x.strip()+'\n')
Use the built-in function all:
if all(t not in x for t in ('reddit', 'passwords')):
Or any:
if not any(t in x for t in ('reddit', 'passwords')):
Here it is in your context manager:
with open('sportstreams.csv') as f, open('sstream.csv', "w") as f2:
    for line in f:
        if any(t in line for t in ('reddit', 'passwords')):
            # The line contains one of the strings.
            continue
        else:
            # The line contains none of the strings.
            f2.write(line.strip() + '\n')
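The tuple can hold as many words or phrases as you need to filter on; for example, with two extra placeholder terms (they are just illustrations, swap in whatever you want to omit):

        if any(t in line for t in ('reddit', 'passwords', 'login', 'sidebar')):
            continue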
I'm trying to create a text-delimited file containing the data from the "Actions" table on webpages like this one: http://stats.swehockey.se/Game/Events/300978
I would like each line to include the game # (from the end of the URL) and then the text from the line on the table. For example:
300972 | 60:00 | GK Out | OHK | 33. Hudacek, Julius
I haven't been able to get each row to actually separate. I've tried parsing through each row and column, using a list of stripped strings, and searching by different tags, classes, and styles.
Here's what I currently have:
from bs4 import BeautifulSoup
import urllib.request

def createtext():
    gamestr = urlstr + "|"

    #Find all table lines. Create one pipe-delimited line for each.
    aptext = gamestr
    for el in soup.find_all('tr'):
        playrow = el.find_all('td', 'tdOdd')
        for td in playrow:
            if(td.find(text=True)) not in ("", None, "\n"):
                aptext = aptext + ''.join(td.text) + "|"
        aptext = aptext + "\n" + gamestr

    #Creates file with Game # as filename and writes the data to the file
    currentfile = urlstr + ".txt"
    with open(currentfile, "w") as f:
        f.write(str(aptext))

#Grabs the HTML file and creates the soup
urlno = 300978
urlstr = str(urlno)
url = ("http://stats.swehockey.se/Game/Events/" + urlstr)

request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
pbpdoc = response.read().decode('utf-8')
soup = BeautifulSoup(pbpdoc)

createtext()
Thanks for any help or guidance!
First of all, you don't have to construct the CSV data manually, Python provides a built-in csv module for that.
Then, since you only want the "Actions" data, I'd identify the "Actions" table and keep the event rows only. This can be done with the help of a filtering function that checks that the first cell is not empty:
import csv

import requests
from bs4 import BeautifulSoup

def only_action_rows(tag):
    if tag.name == 'tr':
        first_cell = tag.find('td', class_='tdOdd')
        return first_cell and first_cell.get_text(strip=True)

event_id = 300978
url = "http://stats.swehockey.se/Game/Events/{event_id}".format(event_id=event_id)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

actions_table = soup.find("h2", text="Actions").find_parent("table")

data = [[event_id] + [td.get_text(strip=True) for td in row.find_all('td', class_='tdOdd')]
        for row in actions_table.find_all(only_action_rows)]

with open("output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(data)
Note that I'm using requests here.
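If you want the pipe-separated layout shown in the question rather than commas, csv.writer accepts a delimiter argument, so the writing step above only needs a small tweak (output.txt is an arbitrary filename here):

with open("output.txt", "w", newline="") as f:
    writer = csv.writer(f, delimiter='|')
    writer.writerows(data)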
import requests
from bs4 import BeautifulSoup
import csv
from urlparse import urljoin
import urllib2

outfile = open("./battingall.csv", "wb")
writer = csv.writer(outfile)

base_url = 'http://www.baseball-reference.com'
player_url = 'http://www.baseball-reference.com/players/'
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
players = 'shtml'
gamel = '&t=b&year='
game_logs = 'http://www.baseball-reference.com/players/gl.cgi?id='
years = ['2015','2014','2013','2012','2011','2010','2009','2008']

drounders = []
for dround in alphabet:
    drounders.append(player_url + dround)

urlz = []
for ab in drounders:
    data = requests.get(ab)
    soup = BeautifulSoup(data.content)
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            urlz.append(base_url + link['href'])

yent = []
for ant in urlz:
    for d in drounders:
        for y in years:
            if players in ant:
                if len(ant) < 60:
                    if d in ant:
                        yent.append(game_logs + ant[44:-6] + gamel + y)

for j in yent:
    try:
        data = requests.get(j)
        soup = BeautifulSoup(data.content)
        table = soup.find('table', attrs={'id': 'batting_gamelogs'})
        tablea = j[52:59]
        tableb = soup.find("b", text='Throws:').next_sibling.strip()
        tablec = soup.find("b", text='Height:').next_sibling.strip()
        tabled = soup.find("b", text='Weight:').next_sibling.strip()
        list_of_rows = []
        for row in table.findAll('tr'):
            list_of_cells = []
            list_of_cells.append(tablea)
            list_of_cells.append(j[len(j)-4:])
            list_of_cells.append(tableb)
            list_of_cells.append(tablec)
            list_of_cells.append(tabled)
            for cell in row.findAll('td'):
                text = cell.text.replace(' ', '').encode("utf-8")
                list_of_cells.append(text)
            list_of_rows.append(list_of_cells)
        print list_of_rows
        writer.writerows(list_of_rows)
    except (AttributeError,NameError):
        pass
When I run this code to get gamelog batting data, I keep getting an error:
Traceback (most recent call last):
File "battinggamelogs.py", line 44, in <module>
data = requests.get(j)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site- packages/requests/api.py", line 65, in get
return request('get', url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site- packages/requests/api.py", line 49, in request
response = session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 461, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 573, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/adapters.py", line 415, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))
I need a way to bypass this error and keep going. I think the error comes up because there is no table to get data from.
You can wrap your requests.get() block in a try/except. You need to catch the requests.exceptions.ConnectionError that is being generated.
for ab in drounders:
    try:
        data = requests.get(ab)
        soup = BeautifulSoup(data.content)
        for link in soup.find_all('a'):
            if link.has_attr('href'):
                urlz.append(base_url + link['href'])
    except requests.exceptions.ConnectionError:
        pass
This is occurring because the connection itself has a problem, not because there is no data in the table. You aren't even getting that far.
Note: This is completely eating the exception by simply using pass (as you are also doing later in the code block). It may be better to do something like this:
    except requests.exceptions.ConnectionError:
        print("Failed to open {}".format(ab))
This will provide you with a message on the console of what URL is failing.
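If you would rather retry a flaky URL a couple of times before giving up entirely (an extra suggestion, not part of the original answer; the helper name and the retry counts are arbitrary), a small helper along these lines works:

import time

def get_with_retries(url, attempts=3, delay=2):
    # Try the request a few times, pausing between attempts,
    # and return None if every attempt fails.
    for attempt in range(attempts):
        try:
            return requests.get(url)
        except requests.exceptions.ConnectionError:
            if attempt == attempts - 1:
                print("Giving up on {}".format(url))
                return None
            time.sleep(delay)

You could then call data = get_with_retries(ab) and skip the URL when it returns None.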