I am trying to scrape the text of all of the episodes of all of the TV series on a webpage. The whole thing is nested, so the script goes through three webpages before it reaches the list of links. It throws an error, which I have pasted below the code.
import requests
import bs4 as bs
urls='http://dl5.lavinmovie.net/Series/'
url=requests.get(urls).text
soup=bs.BeautifulSoup(url,'lxml')
title=soup.find_all('a')
ur=[""]
names=[""]
season=[""]
quality=[""]
for i in title:
    # names.append(i.text)
    urlss = urls + i.text + "/"
    urla = requests.get(urls).text
    soupp = bs.BeautifulSoup(urla, 'lxml')
    ur = soupp.find_all('a')
    for i in ur:
        # names.append(i.text)
        urls = urls + i.text + "/"
        urla = requests.get(urls).text
        soupp = bs.BeautifulSoup(urla, 'lxml')
        ur = soupp.find_all('a')
        for i in ur:
            # quality.append(i.text)
            urls = urls + i.text + "/"
            urla = requests.get(urls).text
            soupp = bs.BeautifulSoup(urla, 'lxml')
            ur = soupp.find_all('a')
            for i in ur:
                print(i.text)
Traceback (most recent call last):
File "C:\Users\Vedant Mamgain\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 603, in urlopen
chunked=chunked)
File "C:\Users\Vedant Mamgain\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "C:\Users\Vedant Mamgain\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\Vedant Mamgain\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 1321, in getresponse
response.begin()
File "C:\Users\Vedant Mamgain\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 296, in begin
version, status, reason = self._read_status()
File "C:\Users\Vedant Mamgain\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 257, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Users\Vedant Mamgain\AppData\Local\Programs\Python\Python37\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
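One general way to soften connection resets like WinError 10054, independent of the answers below, is to reuse a single session and let it retry with a backoff instead of calling requests.get() fresh for every page. A minimal sketch using requests' built-in retry support (the retry count and backoff value are arbitrary, not taken from the question):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# one session reused for every request, with automatic retries and backoff
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

html = session.get('http://dl5.lavinmovie.net/Series/').text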
Try using this, it worked for me:
import requests
import bs4 as bs
names = list()
name_links = list()
base_url = 'http://dl5.lavinmovie.net/Series/'
final_list = list()
soup = bs.BeautifulSoup(requests.get(base_url).text, 'lxml')
title = soup.find_all('a')
for link in title[1:]:
    names.append(link.text)
    current_link = link['href']
    print(link.text)
    name_links.append(str(current_link))
    # get seasons
    soup = bs.BeautifulSoup(requests.get(base_url + current_link).text, 'lxml')
    title = soup.find_all('a')
    for link in title[1:]:
        season_link = link['href']
        # get quality of the seasons
        soup = bs.BeautifulSoup(requests.get(base_url + current_link + season_link).text, 'lxml')
        title = soup.find_all('a')
        for link in title[1:]:
            quality_link = link['href']
            # get list of episodes
            soup = bs.BeautifulSoup(requests.get(base_url + current_link + season_link + quality_link).text, 'lxml')
            title = soup.find_all('a')
            for link in title[1:]:
                episode_link = link['href']
                final_list.append(episode_link)
Check if this works for you.
import requests
import bs4 as bs
urls = 'http://dl5.lavinmovie.net/Series/'
url = requests.get(urls).text
soup = bs.BeautifulSoup(url, 'lxml')
title = soup.find_all('a')
for i in title:
    if(i.text != '../' and ".mp4" not in i.text):
        urll = urls + i.text
        # arr.append(i.text)
        urll1 = requests.get(urll).text
        soupp1 = bs.BeautifulSoup(urll1, 'lxml')
        season = soupp1.find_all('a')
        print(i.text)
        for j in season:
            if(j.text != '../' and ".mp4" not in j.text):
                urlla = urll + j.text
                urll2 = requests.get(urlla).text
                soupp2 = bs.BeautifulSoup(urll2, 'lxml')
                quality = soupp2.find_all('a')
                print(j.text)
                for k in quality:
                    if(k.text != '../' and ".mp4" not in k.text):
                        urllb = urlla + k.text
                        urll3 = requests.get(urllb).text
                        soupp3 = bs.BeautifulSoup(urll3, 'lxml')
                        episode = soupp3.find_all('a')
                        print(k.text)
                        for m in episode:
                            if(m.text != '../' and ".mp4" not in m.text):
                                print(m.text)
I have solved the problem myself as well. Thanks to everyone who helped.
Related
I'm trying to save the scraped social media link to an Excel file using openpyxl, but I'm getting the following error:
Traceback (most recent call last):
File "/Users/xxxx/_Main_.py", line 40, in <module>
sheet.cell(cell.row, col2).value = ig_get_present
File "/Users/xxxx/venv/lib/python3.10/site-packages/openpyxl/cell/cell.py", line 215, in value
self._bind_value(value)
File "/Users/xxxx/venv/lib/python3.10/site-packages/openpyxl/cell/cell.py", line 184, in _bind_value
raise ValueError("Cannot convert {0!r} to Excel".format(value))
ValueError: Cannot convert ['https://www.instagram.com/xxxx/'] to Excel
This is the code leading to it; I have no idea why it's happening.
column_name = 'URL'
column_name2 = 'Instagram'
headers = [cell.value for cell in sheet[1]]
col = get_column_letter(headers.index(column_name) + 1)
col2 = headers.index(column_name2) + 1
for cell in sheet[col][1:]:
    url = cell.value
    r = requests.get(url)
    ig_get = ['instagram.com']
    ig_get_present = []
    soup = BeautifulSoup(r.content, 'html5lib')
    all_links = soup.find_all('a', href=True)
    for ig_get in ig_get:
        for link in all_links:
            if ig_get in link.attrs['href']:
                ig_get_present.append(link.attrs['href'])
    sheet.cell(cell.row, col2).value = ig_get_present
Converting the data to a string fixed my issue.
ig_got = str(ig_get_present)
sheet.cell(cell.row, col2).value = ig_got
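If more than one matching link can turn up for a row, joining the list may read better in the sheet than calling str() on the whole list; a small sketch reusing the variable names from the snippet above:

# one readable cell value whether zero, one, or many links were found
ig_got = ", ".join(ig_get_present)
sheet.cell(cell.row, col2).value = ig_got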
I have this code:
URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"
html_report_part1 = open(URL, 'r', encoding="UTF-8").read()
soup = BeautifulSoup(html_report_part1, "html.parser")
and it returns this error:
During handling of the above exception, another exception occurred:
MemoryError
Try:
URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"
html_report_part1 = open(URL, 'r', encoding="UTF-8")
html_text = ''
for line in html_report_part1.readlines():
    html_text += line
soup = BeautifulSoup(html_text, "html.parser")
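If the file is still too large to parse comfortably, another option (my suggestion, not part of the answer above) is bs4's SoupStrainer, which builds the tree only for the tags you actually need; the filter on table tags below is just an illustration:

from bs4 import BeautifulSoup, SoupStrainer

URL = r"C:\Users\jarze\PycharmProjects\CSV-modifier\venv\html.html"

# parse only <table> elements instead of building the full document tree
only_tables = SoupStrainer("table")
with open(URL, 'r', encoding="UTF-8") as f:
    soup = BeautifulSoup(f, "html.parser", parse_only=only_tables)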
I hope you're well. Could you please tell me why my scraping script doesn't work properly? :)
It works with other websites. I'm a beginner, so I probably made a basic mistake.
import requests
from bs4 import BeautifulSoup
import time
import csv
links = []
for i in range(1):
    url = '*******/recettes/?page={}' + str(i)
    res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = requests.get(url)
    print(response)
    if response.ok:
        print('Page: ' + str(i))
        soup = BeautifulSoup(response.text, "html.parser")
        divs = soup.findAll('div', class_='field-item even')
        for div in divs:
            a = div.find('a')
            link = a['href']
            links.append('*******' + link)
    time.sleep(3)

print(len(links))

with open('urls3.txt', 'w') as file:
    for link in links:
        file.write(link + '\n')

"""
with open('urls3.txt', 'r') as inf:
    with open('recipes3.csv', 'w') as outf:
        outf.write('titre,image,url,\n')
        for row in inf:
            url = row.strip()
            response = requests.get(url)
            if response.ok:
                soup = BeautifulSoup(response.text, "html.parser")
                titre = soup.find('h1')
                image = soup.find('img', {"id":"recipe-media-viewer-thumbnail-1"})['src']
                print(titre.text, image, url)
                outf.write(str(titre.text) + ',' + str(image) + ',' + str(url) + '\n')
                time.sleep(1)
"""
Could you please tell me why there is an error here:
<Response [200]>
Page: 0
Traceback (most recent call last):
File "ex3.py", line 18, in <module>
link = a['href']
TypeError: 'NoneType' object is not subscriptable
I've found the answer and I'm posting it here :) for anyone interested:
try:
    image = soup.find('img', {"id": "recipe-media-viewer-thumbnail-1"})['src']
except Exception as e:
    image = None
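The traceback itself ('NoneType' object is not subscriptable on link = a['href']) comes from the same pattern: div.find('a') returns None when a div holds no link. The same kind of guard works there; a minimal sketch:

for div in divs:
    a = div.find('a')
    # skip divs that do not contain a link at all
    if a is not None and a.has_attr('href'):
        links.append('*******' + a['href'])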
I get the following error when trying to parse a large number of web pages from a website: "Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'". How can I fix it?
The full error message is:
File "main.py", line 29, in <module>
records = p.map(defs.scrape,state_urls)
File "C:\Users\Utilisateur\Anaconda3\lib\multiprocessing\pool.py", line 266, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\Utilisateur\Anaconda3\lib\multiprocessing\pool.py", line 644, in get
raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '<multiprocessing.pool.ExceptionWithTraceback object at 0x0000018DD1C3D828>'. Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)'
I browsed through some of the answers for similar questions here, namely this one (multiprocessing.pool.MaybeEncodingError: Error sending result: Reason: 'TypeError("cannot serialize '_io.BufferedReader' object",)') but I don't think I'm running into the same issue, as I don't handle files directly in the scrape function.
I tried modifying the scrape function so it returned a string and not a list (I don't know why I did that), but it didn't work.
From the main.py file:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from multiprocessing import Pool
import codecs
import defs
if __name__ == '__main__':
    filename = "some_courts_test.csv"
    # not the actual values
    courts = ["blabla", "blablabla", "blablabla", "blabla"]
    client = defs.init_client()
    i = 1

    # scrapes the data from the website and puts it into a csv file
    for court in courts:
        records = []
        records_string = ""
        print("creating a file for the court of : " + court)
        f = defs.init_court_file(court)
        print("generating urls for the court of " + court)
        state_urls = defs.generate_state_urls(court)
        for url in state_urls:
            print(url)
        print("scraping creditors from : " + court)
        p = Pool(10)
        records = p.map(defs.scrape, state_urls)
        records_string = ''.join(records[1])
        p.terminate()
        p.join()
        for r in records_string:
            f.write(r)
        records = []
        f.close()
From the defs file:
def scrape(url):
    data = []
    row_string = ' '
    final_data = []
    final_string = ' '
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    table = page_soup.find("table", {"class": "table table-striped"})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.replace(',', ' ') for ele in cols]  # cleans it up
        for ele in cols:
            if ele:
                data.append(ele)
                data.append(',')
        data.append('\n')
    return(data)
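One common reading of this MaybeEncodingError (my interpretation, not something stated in the question) is that scrape raised an exception inside a worker, and the exception the pool tried to pickle and send back referenced the open urlopen response, which is not picklable. Keeping failures inside the worker so that only plain strings cross the pool boundary sidesteps that; a hedged sketch of scrape along those lines:

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

def scrape(url):
    # catch everything here so that only plain (picklable) lists of
    # strings ever travel back through the Pool
    try:
        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()
    except Exception as exc:
        return ['FAILED ' + url + ': ' + str(exc), '\n']

    page_soup = soup(page_html, "html.parser")
    table = page_soup.find("table", {"class": "table table-striped"})
    if table is None or table.find('tbody') is None:
        return ['NO TABLE ' + url, '\n']

    data = []
    for row in table.find('tbody').find_all('tr'):
        cols = [ele.text.replace(',', ' ') for ele in row.find_all('td')]
        for ele in cols:
            if ele:
                data.append(ele)
                data.append(',')
        data.append('\n')
    return data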
import requests
from bs4 import BeautifulSoup
import csv
from urlparse import urljoin
import urllib2
outfile = open("./battingall.csv", "wb")
writer = csv.writer(outfile)
base_url = 'http://www.baseball-reference.com'
player_url = 'http://www.baseball-reference.com/players/'
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
players = 'shtml'
gamel = '&t=b&year='
game_logs = 'http://www.baseball-reference.com/players/gl.cgi?id='
years = ['2015','2014','2013','2012','2011','2010','2009','2008']
drounders = []
for dround in alphabet:
    drounders.append(player_url + dround)

urlz = []
for ab in drounders:
    data = requests.get(ab)
    soup = BeautifulSoup(data.content)
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            urlz.append(base_url + link['href'])

yent = []
for ant in urlz:
    for d in drounders:
        for y in years:
            if players in ant:
                if len(ant) < 60:
                    if d in ant:
                        yent.append(game_logs + ant[44:-6] + gamel + y)

for j in yent:
    try:
        data = requests.get(j)
        soup = BeautifulSoup(data.content)
        table = soup.find('table', attrs={'id': 'batting_gamelogs'})
        tablea = j[52:59]
        tableb = soup.find("b", text='Throws:').next_sibling.strip()
        tablec = soup.find("b", text='Height:').next_sibling.strip()
        tabled = soup.find("b", text='Weight:').next_sibling.strip()
        list_of_rows = []
        for row in table.findAll('tr'):
            list_of_cells = []
            list_of_cells.append(tablea)
            list_of_cells.append(j[len(j)-4:])
            list_of_cells.append(tableb)
            list_of_cells.append(tablec)
            list_of_cells.append(tabled)
            for cell in row.findAll('td'):
                text = cell.text.replace(' ', '').encode("utf-8")
                list_of_cells.append(text)
            list_of_rows.append(list_of_cells)
        print list_of_rows
        writer.writerows(list_of_rows)
    except (AttributeError, NameError):
        pass
When I run this code to get gamelog batting data I keep getting an error:
Traceback (most recent call last):
File "battinggamelogs.py", line 44, in <module>
data = requests.get(j)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site- packages/requests/api.py", line 65, in get
return request('get', url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site- packages/requests/api.py", line 49, in request
response = session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 461, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 573, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/adapters.py", line 415, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))
I need a way to bypass this error and keep going. I think the error comes up because there is no table to get data from.
You can wrap your requests.get() block in a try/except. You need to catch the requests.exceptions.ConnectionError that is being generated.
for ab in drounders:
    try:
        data = requests.get(ab)
        soup = BeautifulSoup(data.content)
        for link in soup.find_all('a'):
            if link.has_attr('href'):
                urlz.append(base_url + link['href'])
    except requests.exceptions.ConnectionError:
        pass
This is occurring because the connection itself has a problem, not because there is no data in the table. You aren't even getting that far.
Note: This is completely eating the exception by simply using pass (as you are also doing later in the code block). It may be better to do something like this:
except requests.exceptions.ConnectionError:
    print("Failed to open {}".format(ab))
This will print a message to the console telling you which URL is failing.
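If you would rather retry a flaky URL a few times before skipping it, a small helper along these lines (the attempt count and delay are arbitrary) could replace the bare requests.get() calls:

import time
import requests

def get_with_retries(url, attempts=3, wait=5):
    # illustrative helper: try the URL a few times before giving up on it
    for attempt in range(attempts):
        try:
            return requests.get(url)
        except requests.exceptions.ConnectionError:
            print("Failed to open {} (attempt {})".format(url, attempt + 1))
            time.sleep(wait)
    return None

Any URL that still returns None after the retries can then be skipped just like in the except block above.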