Looping url from csv file for scrape html with python - python

I'm learning to use python to scrape websites (online store)
I'm create a blocke code to scrape a website, where the url to scrape is in the CSV file that I will load to scrape.
However, after running the repetition can only work once in one of the lines does not reach the end of the url in the CSV and does not continue to the next URL.
disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
AttributeError: 'NoneType' object has no attribute 'text'
how do I get through if an error occurs when html is not found?
the following line of code I use python, please help so that the looping scrape runs to the end of the url list
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import pandas
with open('Url Torch.csv','rt') as f:
data = csv.reader(f, delimiter=',')
for row in data:
URL_GO = row[2]
def variable_Scrape(url):
try:
cookies = dict(cookie="............")
request = requests.get(url, cookies=cookies)
html = BeautifulSoup(request.content, 'html.parser')
title = html.find('div', class_='title').text.strip().strip('\n')
desc = html.find('div', class_='content').text
link = html.find_all('img', class_='lazyload slide-item owl-lazy')
normal_price = html.find('div', class_='amount public').text.strip().strip('\n')
disc_information = html.find('div', class_='alert alert-info global-promo').text.strip().strip('\n')
except AttributeError as e:
print(e)
#ConnectionAbortedError
return False
else:
print(title)
#print(desc)
#print(link)
finally:
print(title)
print(desc)
print(link)
print('Finally.....')
variable_Scrape(URL_GO)

You should call the variable_Scrape(URL_GO) method inside the for-loop and make sure you declare the method before it uses.

Related

Web Scraping Emails using Python

new to web scraping (using python) and encountered a problem trying to get an email from a university's athletic department site.
I've managed to get to navigate to the email I want to extract but don't know where to go from here. When I print what I have, all I get is '' and not the actual text of the email.
I'm attaching what I have so far, let me know if it needs a better explanation.
Here's a link to an image of what I'm trying to scrape. Website
and the website: https://goheels.com/staff-directory
Thanks!
Here's my code:
from bs4 import BeautifulSoup
import requests
urls = ''
with open('websites.txt', 'r') as f:
for line in f.read():
urls += line
urls = list(urls.split())
print(urls)
for url in urls:
res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')
try:
body = soup.find(headers="col-staff_email category-0")
links = body.a
print(links)
except Exception as e:
print(f'"This url didn\'t work:" {url}')
The emails are hidden inside a <script> element. With a little pushing, shoving, css selecting and string splitting you can get there:
for em in soup.select('td[headers*="col-staff_email"] script'):
target = em.text.split('var firstHalf = "')[1]
fh = target.split('";')[0]
lh = target.split('var secondHalf = "')[1].split('";')[0]
print(fh+ '#' +lh)
Output:
bubba.cunningham#unc.edu
molly.dalton#unc.edu
athgallo#unc.edu
dhollier#unc.edu
etc.

Terminal printing 0 when running web scraping code

Why is it just printing 0 when I run this code?
I am trying to print the articles' title, link, and date. Is the .aspx link possibly a problem for this method?
I originally was trying an rss version https://codeburst.io/building-an-rss-feed-scraper-with-python-73715ca06e1f and changing that to an individual website because the rss doesn't give me the info I actually need. Thanks for the help!
from bs4 import BeautifulSoup
import requests
url= 'https://tymeinc.com/newsroom/press-releases/default.aspx'
def news(url):
try:
r = requests.get(url)
soup = BeautifulSoup(r.content, features= 'xml')
articles = soup.findAll('blog_section')
print(len(articles))
for a in articles:
title = a.find('blog_title').text
link = a.find('blog_link').text
published = a.find('module_date-time').text
description = a.find('blog_short-body').text
article = {'blog_title': title,'blog_link': link,'module_date-time': published}
articles.append(article)
return print(url)
return print(title)
#return print(articles, "done")
except Exception as e:
print('The scraping job failed. See exception: ', e)
news(url)

Web scraping python for Arabic text

I am trying to web Scrape the website: "http://norumors.net/?post_type=rumors?post_type=rumors" to get only the heading news and put them in a CSV file using Beautifulsoup and python, This is the code I am using after i look into the HTML source code "view-source:http://norumors.net/?post_type=rumors?post_type=rumors"
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
pagesToGet= 1
upperframe=[]
for page in range(1,pagesToGet+1):
print('processing page :', page)
url = 'http://norumors.net/?post_type=rumors/?page='+str(page)
print(url)
#an exception might be thrown, so the code should be in a try-except block
try:
#use the browser to get the url. This is suspicious command that might blow up.
page=requests.get(url) # this might throw an exception if something goes wrong.
except Exception as e: # this describes what to do if an exception is thrown
error_type, error_obj, error_info = sys.exc_info() # get the exception information
print ('ERROR FOR LINK:',url) #print the link that cause the problem
print (error_type, 'Line:', error_info.tb_lineno) #print error info and line that threw the exception
continue #ignore this page. Abandon this and go back.
time.sleep(2)
soup=BeautifulSoup(page.text,'html.parser')
frame=[]
links=soup.find_all('li',attrs={'class':'o-listicle__item'})
print(len(links))
filename="NEWS.csv"
f=open(filename,"w", encoding = 'utf-8')
headers="Statement,Link\n"
f.write(headers)
for j in links:
Statement = j.find("div",attrs={'class':'row d-flex'}).text.strip()
# Link = "http://norumors.net/"
Link += j.find("div",attrs={'class':'col-lg-4 col-md-4 col-sm-6 col-xs-6'}).find('a')['href'].strip()
frame.append((Statement,Link))
f.write(Statement.replace(",","^")+","+Link+","+Date.replace(",","^")+","+Source.replace(",","^")+","+Label.replace(",","^")+"\n")
upperframe.extend(frame)
f.close()
data=pd.DataFrame(upperframe, columns=['Statement','Link'])
data.head()
but After I run the code I am getting the pandas data frame and CSV file empty, any suggestion why is that? knowing that i want to get the text between tags.
If I understand correctly, you want to get the text part of the news headlines and the href link to these news. You further want to write them into a CSV file. The problem with your code is for j in links: is not executed because soup.find_all('li',attrs={'class':'o-listicle__item'}) returns an empty list. You should be careful with the names and classes of the tags that you are searching. Below code gets news texts and their links, it also writes them to the CSV file using pd.DataFrame.
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
pagesToGet = 1
for page in range(1,pagesToGet+1):
print('processing page :', page)
url = 'http://norumors.net/?post_type=rumors/?page=' + str(page)
print(url)
#an exception might be thrown, so the code should be in a try-except block
try:
#use the browser to get the url. This is suspicious command that might blow up.
page = requests.get(url) # this might throw an exception if something goes wrong.
except Exception as e: # this describes what to do if an exception is thrown
error_type, error_obj, error_info = sys.exc_info() # get the exception information
print('ERROR FOR LINK:',url) #print the link that cause the problem
print(error_type, 'Line:', error_info.tb_lineno) #print error info and line that threw the exception
continue #ignore this page. Abandon this and go back.
soup = BeautifulSoup(page.text,'html.parser')
texts = []
links = []
filename = "NEWS.csv"
f = open(filename,"w", encoding = 'utf-8')
Statement = soup.find("div",attrs={'class':'row d-flex'})
divs = Statement.find_all("div",attrs={'class':'col-lg-4 col-md-4 col-sm-6 col-xs-6'})
for div in divs:
txt = div.find("img",attrs={'class':'rumor__thumb'})
texts.append(txt['alt'])
lnk = div.find("a",attrs={'class':'rumor--archive'})
links.append(lnk['href'])
data = pd.DataFrame(list(zip(texts, links)), columns=['Statement', 'Link'])
data.to_csv(f, encoding='utf-8', index=False)
f.close()

Web Scraping - Extract list of text from multiple pages

I want to extract a list of names from multiple pages of a website.
The website has over 200 pages and i want to save all the names to a text file. I have wrote some code but it's giving me index error.
CODE:
import requests
from bs4 import BeautifulSoup as bs
URL = 'https://hamariweb.com/names/muslim/boy/page-'
#for page in range(1, 203):
page = 1
req = requests.get(URL + str(page))
soup = bs(req.text, 'html.parser')
row = soup.find('div', attrs={'class', 'row'})
books = row.find_all('a')
for book in books:
data = book.find_all('b')[0].get_text()
print(data)
OUTPUT:
Aabbaz
Aabid
Aabideen
Aabinus
Aadam
Aadeel
Aadil
Aadroop
Aafandi
Aafaq
Aaki
Aakif
Aalah
Aalam
Aalamgeer
Aalif
Traceback (most recent call last):
File "C:\Users\Mujtaba\Documents\names.py", line 15, in <module>
data = book.find_all('b')[0].get_text()
IndexError: list index out of range
>>>
The reason for getting the error is since it can't find a <b> tag.
Try this code to request each page and save the data to a file:
import requests
from bs4 import BeautifulSoup as bs
MAIN_URL = "https://hamariweb.com/names/muslim/boy/"
URL = "https://hamariweb.com/names/muslim/boy/page-{}"
with open("output.txt", "a", encoding="utf-8") as f:
for page in range(203):
if page == 0:
req = requests.get(MAIN_URL.format(page))
else:
req = requests.get(URL.format(page))
soup = bs(req.text, "html.parser")
print(f"page # {page}, Getting: {req.url}")
book_name = (
tag.get_text(strip=True)
for tag in soup.select(
"tr.bottom-divider:nth-of-type(n+2) td:nth-of-type(1)"
)
)
f.seek(0)
f.write("\n".join(book_name) + "\n")
I suggest to change your parser to html5lib #pip install html5lib. I just think it's better. Second It's better NOT to do a .find() from your soup object DIRECTLY since it might cause some problems where the tags and classes might have duplicates. SO you might be finding data on a html tag where your data isn't even there. So it's better to check everything and inspect element the the tags you want to get and see on what block of code they might be in cause it is easier that way to scrape, also to avoid more errors.
What I did there is I inspected the elements first and FIND the BLOCK of code where you want to get your data and I found that it is on a div and its class is mb-40 content-box that is where all the names you are trying to get are. Luckily the class is UNIQUE and there are no other elements with the same tag and class so we can just directly .find() it.
Then the value of trs are simply the tr tags inside of that block
(Take note also that those <tr> tags are inside of a <table> tag but the good thing is those are the only <tr> tags that exist so there wouldn't be much of a problem like if there would be another <table> tag with the same class value)
which the <tr> tags contains the names you want to get. You may ask why is there [1:] it's because to start at index 1 to NOT include the Header from the table on the website.
Then just loop through those tr tags and get the text. With regards to your error on why is it happening it is simply because of index out of range you are trying to access a .find_all() result list item where it is out of bounds and this might happen if cases that there are no such data that is being found and that also might happen if you DIRECTLY do a .find() function on your soup variable, because there would be times that there are tags and their respective class values are the same BUT! WITH DIFFERENT CONTENT WITHIN IT. So what happens is you're expecting to scrape that particular part of the website but what actually happening is you're scraping a different part, that's why you might not get any data and wonder why it is happening.
import requests
from bs4 import BeautifulSoup as bs
URL = 'https://hamariweb.com/names/muslim/boy/page-'
#for page in range(1, 203):
page = 1
req = requests.get(URL + str(page))
soup = bs(req.content, 'html5lib')
div_container = soup.find('div', class_='mb-40 content-box')
trs = div_container.find_all("tr",class_="bottom-divider")[1:]
for tr in trs:
text = tr.find("td").find("a").text
print(text)
The issue you're having with the IndexError means that in this case the b-tag you found doesn't contains the information that you are looking for.
You can simply wrap that piece of code in a try-except clause.
for book in books:
try:
data = book.find_all('b')[0].get_text()
print(data)
# Add data to the all_titles list
all_titles.append(data)
except IndexError:
pass # There was no element available
This will catch you error and move on. But not break the code.
Below I have also added some extra lines to save your title to a text-file.
Take a look at the inline comments.
import requests
from bs4 import BeautifulSoup as bs
URL = 'https://hamariweb.com/names/muslim/boy/page-'
# Theres is where your titles will be saved. Changes as needed
PATH = '/tmp/title_file.txt'
page = 1
req = requests.get(URL + str(page))
soup = bs(req.text, 'html.parser')
row = soup.find('div', attrs={'class', 'row'})
books = row.find_all('a')
# Here your title will be stored before writing to file
all_titles = []
for book in books:
try:
# Add strip() to cleanup the input
data = book.find_all('b')[0].get_text().strip()
print(data)
# Add data to the all_titles list
all_titles.append(data)
except IndexError:
pass # There was no element available
# Open path to write
with open(PATH, 'w') as f:
# Write all titles on a new line
f.write('\n'.join(all_titles))

Web scraping with python how to get to the text

I'm trying to get the text from a website but can't find a way do to it. How do I need to write it?
link="https://www.ynet.co.il/articles/0,7340,L-5553905,00.html"
response = requests.get(link)
soup = BeautifulSoup(response.text,'html.parser')
info = soup.find('div', attrs={'class':'text14'})
name = info.text.strip()
print(name)
This is how it looks:
i'm getting none everytime
import requests
from bs4 import BeautifulSoup
import json
link="https://www.ynet.co.il/articles/0,7340,L-5553905,00.html"
response = requests.get(link)
soup = BeautifulSoup(response.text,'html.parser')
info = soup.findAll('script',attrs={'type':"application/ld+json"})[0].text.strip()
jsonDict = json.loads(info)
print(jsonDict['articleBody'])
The page seems to store all the article data in json in the <script> tag so try this code.
The solution is :
info = soup.find('meta', attrs={'property':'og:description'})
It gave me the text i needed

Categories

Resources