I have a script on Google App Engine that is started every 20 minutes by cron.yaml. This works locally, on my own machine, and when I go (manually) to the URL that starts the script online, it also works. However, the script always fails to complete online, on Google's instances, when cron.yaml is in charge of starting it.
The log shows no errors, only 2 debug messages:
D 2013-07-23 06:00:08.449
type(soup): <class 'bs4.BeautifulSoup'> END type(soup)
D 2013-07-23 06:00:11.246
type(soup): <class 'bs4.BeautifulSoup'> END type(soup)
Here's my script:
# coding: utf-8
import jinja2, webapp2, urllib2, re
from bs4 import BeautifulSoup as bs
from google.appengine.api import memcache
from google.appengine.ext import db

class Article(db.Model):
    content = db.TextProperty()
    datetime = db.DateTimeProperty(auto_now_add=True)
    companies = db.ListProperty(db.Key)
    url = db.StringProperty()

class Company(db.Model):
    name = db.StringProperty()
    ticker = db.StringProperty()

    #property
    def articles(self):
        return Article.gql("WHERE companies = :1", self.key())

def companies_key(companies_name=None):
    return db.Key.from_path('Companies', companies_name or 'default_companies')

def articles_key(articles_name=None):
    return db.Key.from_path('Articles', articles_name or 'default_articles')

def scrape():
    companies = memcache.get("companies")
    if not companies:
        companies = Company.all()
        memcache.add("companies", companies, 30)
    for company in companies:
        links = links(company.ticker)
        links = set(links)
        for link in links:
            if link is not "None":
                article_object = Article()
                text = fetch(link)
                article_object.content = text
                article_object.url = link
                article_object.companies.append(company.key())  # doesn't work.
                article_object.put()

def fetch(link):
    try:
        html = urllib2.urlopen(url).read()
        soup = bs(html)
    except:
        return "None"
    text = soup.get_text()
    text = text.encode('utf-8')
    text = text.decode('utf-8')
    text = unicode(text)
    if text is not "None":
        return text
    else:
        return "None"

def links(ticker):
    url = "https://www.google.com/finance/company_news?q=NASDAQ:" + ticker + "&start=10&num=10"
    html = urllib2.urlopen(url).read()
    soup = bs(html)
    div_class = re.compile("^g-section.*")
    divs = soup.find_all("div", {"class": div_class})
    links = []
    for div in divs:
        a = unicode(div.find('a', attrs={'href': re.compile("^http://")}))
        link_regex = re.search("(http://.*?)\"", a)
        try:
            link = link_regex.group(1)
            soup = bs(link)
            link = soup.get_text()
        except:
            link = "None"
        links.append(link)
    return links
...and the script's handler in main:
class ScrapeHandler(webapp2.RequestHandler):
    def get(self):
        scrape.scrape()
        self.redirect("/")
My guess is that the problem might be the double for loop in the scrape script, but I don't understand exactly why.
Update:
Articles are indeed being scraped (as many as there should be), and now there are no log errors, or even debug messages at all. Looking at the log, the cron job seemed to execute perfectly. Even so, App Engine's cron job panel says the cron job failed.
I'm pretty sure this error was due to a DeadlineExceededError, which I did not run into locally. My scrape() script now does its thing on fewer companies and articles, and does not run into the exceeded deadline.
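In case it helps someone else: a common way around DeadlineExceededError in the old Python 2.7 runtime is to have the cron handler only enqueue work and let the deferred library run one task per company, since each task gets its own deadline and retries on failure. A rough sketch, re-using the question's links() and fetch() helpers and assuming "deferred: on" is listed under builtins in app.yaml (scrape_company is a hypothetical helper, not part of the original code):
from google.appengine.ext import db, deferred

def scrape_company(company_key):
    # Hypothetical helper: scrape the articles for a single company in its own task.
    company = db.get(company_key)
    for link in set(links(company.ticker)):
        if link != "None":
            article = Article()
            article.content = fetch(link)
            article.url = link
            article.companies.append(company.key())
            article.put()

def scrape():
    # The cron request now only enqueues tasks, so it returns well within its deadline.
    for company in Company.all():
        deferred.defer(scrape_company, company.key())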
Related
I'm new to web scraping (using Python) and ran into a problem trying to get an email address from a university's athletic department site.
I've managed to navigate to the email I want to extract, but I don't know where to go from here. When I print what I have, all I get is '' and not the actual text of the email.
I'm attaching what I have so far; let me know if it needs a better explanation.
Here's a link to an image of what I'm trying to scrape, and the website itself: https://goheels.com/staff-directory
Thanks!
Here's my code:
from bs4 import BeautifulSoup
import requests

urls = ''
with open('websites.txt', 'r') as f:
    for line in f.read():
        urls += line

urls = list(urls.split())
print(urls)

for url in urls:
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    try:
        body = soup.find(headers="col-staff_email category-0")
        links = body.a
        print(links)
    except Exception as e:
        print(f'"This url didn\'t work:" {url}')
The emails are hidden inside a <script> element. With a little pushing, shoving, css selecting and string splitting you can get there:
for em in soup.select('td[headers*="col-staff_email"] script'):
    target = em.text.split('var firstHalf = "')[1]
    fh = target.split('";')[0]
    lh = target.split('var secondHalf = "')[1].split('";')[0]
    print(fh + '#' + lh)
Output:
bubba.cunningham#unc.edu
molly.dalton#unc.edu
athgallo#unc.edu
dhollier#unc.edu
etc.
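For completeness, here is roughly how that snippet plugs into the question's fetch code. The selector and the firstHalf/secondHalf markers come from the answer above; the rest is ordinary requests/BeautifulSoup plumbing (the '#' joiner is kept from the answer, and the real addresses presumably use '@'):
from bs4 import BeautifulSoup
import requests

res = requests.get('https://goheels.com/staff-directory')
soup = BeautifulSoup(res.text, 'html.parser')

# Each email cell contains a <script> that assembles the address from two halves.
for em in soup.select('td[headers*="col-staff_email"] script'):
    target = em.text.split('var firstHalf = "')[1]
    first_half = target.split('";')[0]
    second_half = target.split('var secondHalf = "')[1].split('";')[0]
    print(first_half + '#' + second_half)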
Recently I started getting acquainted with the web, and with web scrapers in particular. To understand them better, I decided to implement a small program: a scraper that collects all the links that users leave in the comments on posts in the /r/Python subreddit.
Here is the code I got:
from bs4 import BeautifulSoup
import requests
from urllib.error import HTTPError


class Post:
    def __init__(self, thread, title, url, inner_links=None):
        if inner_links is None:
            inner_links = []
        self.thread = thread
        self.title = title
        self.url = url
        self.inner_links = inner_links


def get_new_posts_reddit(thread: str):
    reddit_url = 'https://www.reddit.com'
    html = requests.get(reddit_url + '/r/' + thread).content.decode('utf8')
    bs = BeautifulSoup(html, 'html.parser')
    posts = []
    try:
        for post_link in bs.find_all('a', class_='SQnoC3ObvgnGjWt90zD9Z _2INHSNB8V5eaWp4P0rY_mE'):
            posts.append(Post(thread, post_link.text, reddit_url + post_link['href']))
    except HTTPError:
        return []
    return posts


def get_inner_links(post: Post):
    html = requests.get(post.url).content.decode('utf8')
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', class_='_3t5uN8xUmg0TOwRCOGQEcU'):
        post.inner_links.append({'text': link.find_parent('div').text, 'link': link['href']})


python_posts = get_new_posts_reddit('Python')
for elem in python_posts:
    get_inner_links(elem)

with open('result.txt', 'w', encoding="utf8") as file:
    for elem in python_posts:
        file.write(str(elem.inner_links) + '\n')
The main problem is that sometimes this program works and sometimes it doesn't: in roughly one run out of five it collects the first 7 posts from the subreddit, and then finds inner links in only one of those 7 posts. I think the problem might be that I send requests to the site too often, or something like that. Please help me figure this out.
I found out that the problem was that I was getting back a page whose content hadn't loaded yet. I rewrote the parser with Selenium and everything worked.
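For anyone curious what that change might look like, here is a minimal Selenium sketch that waits for the post links to appear before parsing. The class name is the same one the requests version relied on (it may change whenever Reddit redesigns), and a working chromedriver is assumed:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

POST_LINK_CLASS = 'SQnoC3ObvgnGjWt90zD9Z'  # first of the two classes used above

driver = webdriver.Chrome()
try:
    driver.get('https://www.reddit.com/r/Python')
    # Block until at least one post link is actually present in the DOM.
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CLASS_NAME, POST_LINK_CLASS))
    )
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for post_link in soup.find_all('a', class_=POST_LINK_CLASS):
        print(post_link.text, 'https://www.reddit.com' + post_link['href'])
finally:
    driver.quit()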
Why is it just printing 0 when I run this code?
I am trying to print the articles' title, link, and date. Is the .aspx link possibly a problem for this method?
I was originally following an RSS version (https://codeburst.io/building-an-rss-feed-scraper-with-python-73715ca06e1f) and tried adapting it to an individual website, because the RSS feed doesn't give me the info I actually need. Thanks for the help!
from bs4 import BeautifulSoup
import requests

url = 'https://tymeinc.com/newsroom/press-releases/default.aspx'

def news(url):
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, features='xml')
        articles = soup.findAll('blog_section')
        print(len(articles))
        for a in articles:
            title = a.find('blog_title').text
            link = a.find('blog_link').text
            published = a.find('module_date-time').text
            description = a.find('blog_short-body').text
            article = {'blog_title': title, 'blog_link': link, 'module_date-time': published}
            articles.append(article)
            return print(url)
        return print(title)
        #return print(articles, "done")
    except Exception as e:
        print('The scraping job failed. See exception: ', e)

news(url)
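In case it helps whoever lands here: the .aspx page is almost certainly ordinary HTML, not an RSS/XML feed, so findAll('blog_section') has no <blog_section> tags to match and len(articles) is 0. Those blog_* names look like CSS classes in the rendered page rather than tag names. A hedged sketch of searching by class instead of by tag (the class names are taken from the question's code and are an assumption about the page's markup, which may also be filled in by JavaScript):
from bs4 import BeautifulSoup
import requests

url = 'https://tymeinc.com/newsroom/press-releases/default.aspx'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

articles = []
# Search by class (class_=...), not by tag name.
for section in soup.find_all(class_='blog_section'):
    title = section.find(class_='blog_title')
    link = section.find(class_='blog_link')
    published = section.find(class_='module_date-time')
    if title and link and published:
        articles.append({
            'title': title.get_text(strip=True),
            'link': link.get_text(strip=True),
            'published': published.get_text(strip=True),
        })

print(len(articles))
for article in articles:
    print(article)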
I recently started learning Python. In the process of learning about web scraping, I followed an example to scrape from Google News. After running my code, I get the message "Process finished with exit code 0" with no results. If I change the URL to "https://yahoo.com" I get results. Could anyone point out what, if anything, I am doing wrong?
Code:
import urllib.request
from bs4 import BeautifulSoup


class Scraper:
    def __init__(self, site):
        self.site = site

    def scrape(self):
        r = urllib.request.urlopen(self.site)
        html = r.read()
        parser = "html.parser"
        sp = BeautifulSoup(html, parser)
        for tag in sp.find_all("a"):
            url = tag.get("href")
            if url is None:
                continue
            if "html" in url:
                print("\n" + url)


news = "https://news.google.com/"
Scraper(news).scrape()
Try this out:
import urllib.request
from bs4 import BeautifulSoup


class Scraper:
    def __init__(self, site):
        self.site = site

    def scrape(self):
        r = urllib.request.urlopen(self.site)
        html = r.read()
        parser = "html.parser"
        sp = BeautifulSoup(html, parser)
        for tag in sp.find_all("a"):
            url = tag.get("href")
            if url is None:
                continue
            else:
                print("\n" + url)


if __name__ == '__main__':
    news = "https://news.google.com/"
    Scraper(news).scrape()
Initially you were checking each link to see if it contained 'html'. I am assuming the example you were following was checking whether the links ended in '.html', which the Google News links don't, so that check filtered everything out.
BeautifulSoup works really well, but you need to check the source code of the website you're scraping to get an idea of how it is laid out. DevTools in Chrome works really well for this; press F12 to get there quickly.
I removed:
if "html" in url:
    print("\n" + url)
and replaced it with:
else:
    print("\n" + url)
I am working through the book "The Self-Taught Programmer" and am having trouble with some Python code. The program runs without any errors; the problem is that there is no output whatsoever.
import urllib.request
from bs4 import BeautifulSoup


class Scraper:
    def __init__(self, site):
        self.site = site

    def scrape(self):
        r = urllib.request.urlopen(self.site)
        html = r.read()
        parser = "html.parser"
        sp = BeautifulSoup(html, parser)
        for tag in sp.find_all("a"):
            url = tag.get("href")
            if url is None:
                continue
            if "html" in url:
                print("\n" + url)


news = "https://news.google.com/"
Scraper(news).scrape()
Look at the last "if" statement. If there's no text "html" in the url, nothing gets printed. Try removing that and un-indenting:
class Scraper:
    def __init__(self, site):
        self.site = site

    def scrape(self):
        r = urllib.request.urlopen(self.site)
        html = r.read()
        parser = "html.parser"
        sp = BeautifulSoup(html, parser)
        for tag in sp.find_all("a"):
            url = tag.get("href")
            if url is None:
                continue
            print("\n" + url)