I'm trying to save the outputted social media link to an Excel file using openpyxl, but I'm getting the following error:
Traceback (most recent call last):
File "/Users/xxxx/_Main_.py", line 40, in <module>
sheet.cell(cell.row, col2).value = ig_get_present
File "/Users/xxxx/venv/lib/python3.10/site-packages/openpyxl/cell/cell.py", line 215, in value
self._bind_value(value)
File "/Users/xxxx/venv/lib/python3.10/site-packages/openpyxl/cell/cell.py", line 184, in _bind_value
raise ValueError("Cannot convert {0!r} to Excel".format(value))
ValueError: Cannot convert ['https://www.instagram.com/xxxx/'] to Excel
This is the code leading to it; I have no idea why it's happening.
column_name = 'URL'
column_name2 = 'Instagram'

headers = [cell.value for cell in sheet[1]]
col = get_column_letter(headers.index(column_name) + 1)
col2 = headers.index(column_name2) + 1

for cell in sheet[col][1:]:
    url = cell.value
    r = requests.get(url)
    ig_get = ['instagram.com']
    ig_get_present = []
    soup = BeautifulSoup(r.content, 'html5lib')
    all_links = soup.find_all('a', href=True)

    for ig_get in ig_get:
        for link in all_links:
            if ig_get in link.attrs['href']:
                ig_get_present.append(link.attrs['href'])

    sheet.cell(cell.row, col2).value = ig_get_present
Converting the data to a string fixed my issue.
ig_got = str(ig_get_present)
sheet.cell(cell.row, col2).value = ig_got
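If you'd rather not have the brackets and quotes that str() puts around the list, a small variation (my own sketch, not part of the original fix) is to join the collected links into one comma-separated string before writing the cell:

# Sketch: join all matched links into a single cell value,
# falling back to an empty string when no Instagram link was found.
ig_got = ', '.join(ig_get_present) if ig_get_present else ''
sheet.cell(cell.row, col2).value = ig_got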
I've written a script in Python to get some tabular content from a webpage and write it to a csv file. What I wish to do now is let my script write the content to the csv file only if the table (visible as Top Mutual Fund Holders) is available on that page; otherwise it should remove the csv file that has been created.
The table is available on this webpage.
The table I'm looking for is not available on this webpage.
This is my try:
import os
import csv
import requests
from bs4 import BeautifulSoup

url = "https://finance.yahoo.com/quote/UBER/holders?p=UBER"

def get_mutual_fund(soup):
    datalist = []
    for items in soup.select_one("h3:contains('Top Mutual Fund Holders')").find_next_sibling().select("table tr"):
        data = [item.text for item in items.select("th,td")]
        datalist.append(data)
    return datalist

def get_records(link):
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text, "lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = ""

    if item_one:
        writer.writerows(item_one)
    else:
        os.remove("mutual_fund.csv")
    return item_one

if __name__ == '__main__':
    with open("mutual_fund.csv", "w", newline="") as f:
        writer = csv.writer(f)
        for elem in get_records(url):
            print(elem)
I've tried it with the link that does not have that table. However, it throws the following error while deleting the csv file:
Traceback (most recent call last):
File "C:\Users\WCS\AppData\Local\Programs\Python\Python37-32\demo.py", line 33, in <module>
for elem in get_records(url):
File "C:\Users\WCS\AppData\Local\Programs\Python\Python37-32\demo.py", line 27, in get_records
os.remove("mutual_fund.csv")
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'mutual_fund.csv'
How can I delete the csv file when the tabular content is not present?
You are actually deleting the file while it is still open for writing.
You should change your main function accordingly.
def get_records(link):
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text, "lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = None
    return item_one

if __name__ == '__main__':
    delete_file = False
    with open("mutual_fund.csv", "w", newline="") as f:
        writer = csv.writer(f)
        try:
            for elem in get_records(url):
                print(elem)
        except TypeError:
            delete_file = True

    if delete_file:
        os.remove("mutual_fund.csv")
If you keep your existing logic as it is and delete the file when there is nothing to write to the csv, then the following should work:
import os
import csv
import requests
from bs4 import BeautifulSoup

# url = "https://finance.yahoo.com/quote/fb/holders?p=FB"
url = "https://finance.yahoo.com/quote/UBER/holders?p=UBER"

def get_mutual_fund(soup):
    datalist = []
    for items in soup.select_one("h3:contains('Top Mutual Fund Holders')").find_next_sibling().select("table tr"):
        data = [item.text for item in items.select("th,td")]
        datalist.append(data)
    return datalist

def get_records(link):
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text, "lxml")
    try:
        item_one = get_mutual_fund(soup_obj)
    except AttributeError:
        item_one = ""

    if item_one:
        writer.writerows(item_one)
    else:
        f.close()
        os.remove('mutual_fund.csv')

if __name__ == '__main__':
    with open("mutual_fund.csv", "w", newline="") as f:
        writer = csv.writer(f)
        get_records(url)
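A third option (my own sketch, not part of the original answer, reusing the get_mutual_fund function defined above) is to avoid deleting anything at all: scrape first, and only create the csv file once you know the table exists.

def save_records(link, filename="mutual_fund.csv"):
    r = requests.get(link)
    soup_obj = BeautifulSoup(r.text, "lxml")
    try:
        rows = get_mutual_fund(soup_obj)
    except AttributeError:
        rows = []

    # Only open (and therefore create) the csv file when there is data to write.
    if rows:
        with open(filename, "w", newline="") as f:
            csv.writer(f).writerows(rows)
    return rows

if __name__ == '__main__':
    for elem in save_records(url):
        print(elem)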
I've been trying to figure out how to scrape baseball box scores from Fangraphs with Python 3.6 and the BeautifulSoup and Pandas modules. My final goal is to save different sections of the webpage to different sheets in Excel.
In order to do this, I think I have to pull each table separately by its id tag. This is the code to do so for the four tables (below the graph on the page) that would make up the first Excel sheet. Running the code results in this error:
Traceback (most recent call last):
File "Fangraphs Box Score Scraper.py", line 14, in <module>
df1 = pd.read_html(soup,attrs={'id': ['WinsBox1_dghb','WinsBox1_dghp','WinsBox1_dgab','WinsBox1_dgap']})
File "C:\Python36\lib\site-packages\pandas\io\html.py", line 906, in read_html
keep_default_na=keep_default_na)
File "C:\Python36\lib\site-packages\pandas\io\html.py", line 743, in _parse
raise_with_traceback(retained)
File "C:\Python36\lib\site-packages\pandas\compat\__init__.py", line 344, in raise_with_traceback
raise exc.with_traceback(traceback)
TypeError: 'NoneType' object is not callable
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'http://www.fangraphs.com/boxscore.aspx?date=2017-09-10&team=Red%20Sox&dh=0&season=2017'
response = requests.get(url)
soup = BeautifulSoup(response.text,"lxml")
df1 = pd.read_html(soup,attrs={'id': ['WinsBox1_dghb','WinsBox1_dghp','WinsBox1_dgab','WinsBox1_dgap']})
writer = pd.ExcelWriter('Box Scores.xlsx')
df1.to_excel(writer,'Traditional Box Scores')
You're using the wrong ids: you took them from the <div> elements, but read_html's attrs needs the ids from the <table> tags. I also don't think you need BeautifulSoup here. Try this:
import pandas as pd

url = 'http://www.fangraphs.com/boxscore.aspx?date=2017-09-10&team=Red%20Sox&dh=0&season=2017'

df1 = pd.read_html(
    url,
    attrs={'id': ['WinsBox1_dghb_ctl00', 'WinsBox1_dgab_ctl00']}
)

# df1 is now a list of DataFrames
writer = pd.ExcelWriter('Box Scores.xlsx')
row = 0
for df in df1:
    df.to_excel(writer, sheet_name='tables', startrow=row, startcol=0)
    row = row + len(df.index) + 3
writer.save()
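Since the stated goal was to save different sections to different sheets, a small variation (my own sketch, with hypothetical sheet names) writes each DataFrame in the list to its own sheet instead of stacking them on one:

# Hypothetical sheet names, one per table returned by read_html.
sheet_names = ['Home Batting', 'Away Batting']

with pd.ExcelWriter('Box Scores.xlsx') as writer:
    for name, df in zip(sheet_names, df1):
        df.to_excel(writer, sheet_name=name, index=False)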
I have a bunch of xlsx files, named from 1 to 169 like '1.xlsx', '2.xlsx' and so on. But while looping through and reading those files, the code does not see any rows in the 11th file (nrows for the 11th file is always 0, even though the file is not empty if you open it manually) and gives me an IndexError.
I have no idea what is going on with this code.
import os, xlwt, xlrd

file_dir = 'docs/'
files = os.listdir(file_dir)

# Open a file, read the needed variables and return them
def r_file(path, file):
    workbook = xlrd.open_workbook(path + file)
    info_sheet = workbook.sheet_by_index(0)
    data_sheet = workbook.sheet_by_index(1)
    # cells with company info
    print info_sheet.nrows
    company_name = info_sheet.cell(3,3).value
    company_leg_adress = info_sheet.cell(4,3).value
    company_fact_adress = info_sheet.cell(5,3).value
    # cells with answers
    question_1 = data_sheet.cell(3,10).value
    question_1_1 = data_sheet.cell(8,2).value
    question_1_2 = data_sheet.cell(13,2).value
    question_2 = data_sheet.cell(18,10).value
    question_3 = data_sheet.cell(25,10).value
    question_3_additional = [data_sheet.cell(nrow,10).value for nrow in range(30,48)]
    question_4 = data_sheet.cell(51,2).value
    question_5 = data_sheet.cell(56,2).value
    # get the full row as a list
    row_as_list = [company_name, company_leg_adress, company_fact_adress, question_1, question_1_1, question_1_2, question_2, question_3, question_4] + question_3_additional
    return row_as_list

# write companies to a file
def w_file(companies):
    wb = xlwt.Workbook()
    ws = wb.add_sheet('aggr', cell_overwrite_ok=True)
    for company in companies:
        row_as_list = r_file(file_dir, str(company) + '.xlsx')
        for each_index in row_as_list:
            ws.write(company, row_as_list.index(each_index), each_index)
    wb.save('aggregation.xls')

companies_amount = [x for x in range(1,170)]
w_file(companies_amount)
After running it, it returns:
Traceback (most recent call last):
File "/home/ubuntu/workspace/ex50/bin/writing.py", line 44, in <module>
w_file(companies_amount)
File "/home/ubuntu/workspace/ex50/bin/writing.py", line 36, in w_file
row_as_list = r_file(file_dir,str(company)+'.xlsx')
File "/home/ubuntu/workspace/ex50/bin/writing.py", line 13, in r_file
company_name = info_sheet.cell(3,3).value
File "/usr/local/lib/python2.7/dist-packages/xlrd-1.0.0-py2.7.egg/xlrd/sheet.py", line 401, in cell
self._cell_types[rowx][colx],
IndexError: list index out of range
It happens only on the 11th file (no matter which file ends up being the 11th).
Can you tell me what is going on here?
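I don't have the files to test with, but as a first diagnostic step (my own sketch, not a fix) you could print the dimensions of every sheet before indexing into any cells. That should confirm whether the 11th workbook really exposes zero rows to xlrd or whether the data just sits on a different sheet index in that file.

import xlrd

file_dir = 'docs/'

# Diagnostic only: report how xlrd sees each workbook before any cell access.
for company in range(1, 170):
    path = file_dir + str(company) + '.xlsx'
    workbook = xlrd.open_workbook(path)
    for idx in range(workbook.nsheets):
        sheet = workbook.sheet_by_index(idx)
        print company, sheet.name, sheet.nrows, sheet.ncols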
import requests
from bs4 import BeautifulSoup
import csv
from urlparse import urljoin
import urllib2

outfile = open("./battingall.csv", "wb")
writer = csv.writer(outfile)
base_url = 'http://www.baseball-reference.com'
player_url = 'http://www.baseball-reference.com/players/'
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
players = 'shtml'
gamel = '&t=b&year='
game_logs = 'http://www.baseball-reference.com/players/gl.cgi?id='
years = ['2015','2014','2013','2012','2011','2010','2009','2008']

drounders = []
for dround in alphabet:
    drounders.append(player_url + dround)

urlz = []
for ab in drounders:
    data = requests.get(ab)
    soup = BeautifulSoup(data.content)
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            urlz.append(base_url + link['href'])

yent = []
for ant in urlz:
    for d in drounders:
        for y in years:
            if players in ant:
                if len(ant) < 60:
                    if d in ant:
                        yent.append(game_logs + ant[44:-6] + gamel + y)

for j in yent:
    try:
        data = requests.get(j)
        soup = BeautifulSoup(data.content)
        table = soup.find('table', attrs={'id': 'batting_gamelogs'})
        tablea = j[52:59]
        tableb = soup.find("b", text='Throws:').next_sibling.strip()
        tablec = soup.find("b", text='Height:').next_sibling.strip()
        tabled = soup.find("b", text='Weight:').next_sibling.strip()
        list_of_rows = []
        for row in table.findAll('tr'):
            list_of_cells = []
            list_of_cells.append(tablea)
            list_of_cells.append(j[len(j)-4:])
            list_of_cells.append(tableb)
            list_of_cells.append(tablec)
            list_of_cells.append(tabled)
            for cell in row.findAll('td'):
                text = cell.text.replace(' ', '').encode("utf-8")
                list_of_cells.append(text)
            list_of_rows.append(list_of_cells)
        print list_of_rows
        writer.writerows(list_of_rows)
    except (AttributeError, NameError):
        pass
When I run this code to get gamelog batting data, I keep getting an error:
Traceback (most recent call last):
File "battinggamelogs.py", line 44, in <module>
data = requests.get(j)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site- packages/requests/api.py", line 65, in get
return request('get', url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site- packages/requests/api.py", line 49, in request
response = session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 461, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 573, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/adapters.py", line 415, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))
I need a way to bypass this error and keep going. I think the error comes up because there is no table to get data from.
You can wrap your requests.get() block in a try/except. You need to catch the requests.exceptions.ConnectionError that is being generated.
for ab in drounders:
    try:
        data = requests.get(ab)
        soup = BeautifulSoup(data.content)
        for link in soup.find_all('a'):
            if link.has_attr('href'):
                urlz.append(base_url + link['href'])
    except requests.exceptions.ConnectionError:
        pass
This is occurring because the connection itself has a problem, not because there is no data in the table. You aren't even getting that far.
Note: This is completely eating the exception by simply using pass (as you are also doing later in the code block). It may be better to do something like this:
except requests.exceptions.ConnectionError:
    print("Failed to open {}".format(ab))
This will provide you with a message on the console of what URL is failing.
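If the aborted connections turn out to be transient, another option (my own sketch, not part of the original answer) is to retry a request a few times with a short pause before giving up on that URL, rather than skipping it on the first failure:

import time

def get_with_retries(url, attempts=3, delay=5):
    # Try the request a few times, pausing between attempts,
    # and return None only if every attempt fails.
    for attempt in range(attempts):
        try:
            return requests.get(url)
        except requests.exceptions.ConnectionError:
            if attempt == attempts - 1:
                print("Giving up on {}".format(url))
                return None
            time.sleep(delay)

data = get_with_retries(ab)
if data is not None:
    soup = BeautifulSoup(data.content)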
The following code works fine on my machine, but it throws an error at the line
soup = BeautifulSoup(html)
when it's run on another machine. It parses a list of active NBA players off of Yahoo Sports and stores their names and positions in a text file.
from bs4 import BeautifulSoup
import urllib2

'''
scraping the labeled data from yahoo sports
'''

def scrape(filename):
    base_url = "http://sports.yahoo.com/nba/players?type=position&c=NBA&pos="
    positions = ['G', 'F', 'C']
    players = 0
    with open(filename, 'w') as names:
        for p in positions:
            html = urllib2.urlopen(base_url + p).read()
            soup = BeautifulSoup(html)  # throws the error!
            table = soup.find_all('table')[9]
            cells = table.find_all('td')
            for i in xrange(4, len(cells) - 1, 3):
                names.write(cells[i].find('a').string + '\t' + p + '\n')
                players += 1
    print "...success! %r players downloaded." % players
The error it throws is:
Traceback (most recent call last):
File "run_me.py", line 9, in <module>
scrapenames.scrape('namelist.txt')
File "/Users/brapse/Downloads/bball/scrapenames.py", line 15, in scrape
soup = BeautifulSoup(html)
File "/usr/local/Cellar/python/2.6.5/lib/python2.6/site-packages/bs4/__init__.py", line 100, in __init__
self._feed()
File "/usr/local/Cellar/python/2.6.5/lib/python2.6/site-packages/bs4/__init__.py", line 113, in _feed
self.builder.feed(self.markup)
File "/usr/local/Cellar/python/2.6.5/lib/python2.6/site-packages/bs4/builder/_htmlparser.py", line 46, in feed
super(HTMLParserTreeBuilder, self).feed(markup)
File "/usr/local/Cellar/python/2.6.5/lib/python2.6/HTMLParser.py", line 108, in feed
self.goahead(0)
File "/usr/local/Cellar/python/2.6.5/lib/python2.6/HTMLParser.py", line 171, in goahead
self.handle_charref(name)
File "/usr/local/Cellar/python/2.6.5/lib/python2.6/site-packages/bs4/builder/_htmlparser.py", line 58, in handle_charref
self.handle_data(unichr(int(name)))
ValueError: invalid literal for int() with base 10: 'xBB'
I believe it is a bug in the BS4 HTMLParser-based builder code: it crashes on the &#xBB; character reference (which stands for »), treating the number as decimal when it is hexadecimal. I suggest you update BeautifulSoup on that machine.
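If updating isn't immediately possible, one possible workaround (my assumption, not something the original answer verified) is to hand the markup to a different tree builder, since the crash happens in the stdlib-based HTMLParser builder:

# Requires the lxml package; naming the parser explicitly avoids the
# HTMLParser-based builder that chokes on hexadecimal character references.
soup = BeautifulSoup(html, 'lxml')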