I am writing the back-end of an answers aggregator site in python using the Bing API (python bindings here:http://uswaretech.com/blog/2009/06/bing-python-api/). The following is my code:
#!/usr/bin/python
from bingapi import bingapi
import re
import cgi
import cgitb
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def get_data(self):
return ''.join(self.fed)
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
def strip_tags2(data):
p = re.compile(r'<[^<]*?>')
q = re.compile(r'[&;!##$%^*()]*')
data = p.sub('', data)
return q.sub('', data)
def getUrl(item):
return item['Url']
def getContent(item):
return item['Description']
def getInfo(siteStr, qry):
query = "{0} {1}".format(qry, siteStr)
bing = bingapi.Bing('APP_ID_HERE')
j = bing.do_web_search(query)
results = j['SearchResponse']['Web']['Results']
return result
def updateRecent(qry):
f = open("recent.txt", "r")
lines = f.readlines()
f.close()
lines = lines[1:]
if len(qry) > 50: #truncate if string too long
qry = (qry[:50] + '...')
qry = strip_tags2(qry) #strip out the html if injection try
lines.append("\n%s" % qry)
f = open("recent.txt", "w")
f.writelines(lines)
f.close()
if __name__ == '__main__':
form = cgi.FieldStorage()
qry = form["qry"].value
qry = r'%s' % qry
updateRecent(qry)
siteStr = "site:answers.yahoo.com OR site:chacha.com OR site:blurtit.com OR site:answers.com OR site:question.com OR site:answerbag.com OR site:stackexchange.com"
print "Content-type: text/html"
print
header = open("header.html", "r")
contents = header.readlines()
header.close()
for item in contents:
print item
print """
<div id="results">
<center><h1>Results:</h1></center>
"""
print getInfo(siteStr, qry)
for item in getInfo(siteStr, qry):
print "<h3>%s</h3>" % getUrl(item)
print "<br />"
print "<p style=\"color:gray\">%s</p>" % getContent(item)
print "<br />"
print "</div>"
footer = open("footer.html", "r")
contents = footer.readlines()
footer.close()
for thing in contents:
print thing
For some reason when I run this in my browser (sending it a query using a text input) it doesn't print anything. Can someone explain why this is happening? Thx in advance!
Nevermind, just found a syntax error that apache didn't pick up I guess. In the "getInfo()" function it says "return result" when it should say "return results".
Related
My parser load only first href for list, but I need to load all href for list. This is my code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
from lxml import html
import urllib.parse
import urllib.request
class VashMagaz(object):
RESULT = []
def parse_vashmagaz_run(self):
url = 'https://vashmagazin.ua/nerukhomist/kvartyry/'
r = requests.get(url)
res = html.fromstring(r.content)
result = res.xpath(u'//*[contains(text(), "120")]/#href')
num = self._get_page_num(result[0])
result = self.get_page_data(num)
return result
def get_page_data(self, num):
url = 'https://vashmagazin.ua/nerukhomist/kvartyry/?item_price1=&item_price2=&page={}'
for i in range(1, num):
r = requests.get(url.format(i))
self.get_all(r.content)
return self.RESULT
def _get_page_num(self,href):
result = urllib.parse.urlparse(href)
result = urllib.parse.parse_qs(result.query)
return int(result['page'][0])
def get_all(self, data):
data = self._get_desc(data)
for key, i in enumerate(data):
text = i.xpath('.//h3[#class="ner_h3"]/a/text()')[key]
href = i.xpath('.//h3[#class="ner_h3"]/a/#href')[key]
self.RESULT.append({'text': text,
'href': 'https://vashmagazin.ua/' + href,
})
def _get_desc(self, data):
return self.get_from_xpath(data, '//*[#id="price"]')
def get_from_xpath(self, data, xpath):
res = html.fromstring(data)
return res.xpath(xpath)
if __name__ == '__main__':
magaz = VashMagaz()
magaz.parse_vashmagaz_run()
msg = u'Subject: Квартири'+"\n"
for res in magaz.RESULT:
for k, i in res.items():
msg+=str(res[k]).strip()+"\n"
msg+='-------------------------------'+'\n'
print(msg)
I'm trying to use multithreads to go through a txt file of urls and scrape the contents found at each url. This works for about 20 URLs (not consistent how many) but then consistently gets stuck on the last url in the file. It doesn't seem to be doing them in order.
I have no idea why it's getting stuck or where to start so thank you so much for your help.
from bs4 import BeautifulSoup, SoupStrainer
import urllib3
import urllib2
import io
import os
import re
import workerpool
from urllib2 import Request, urlopen, URLError
NUM_SOCKETS = 3
NUM_WORKERS = 5
urlfile = open("dailynewsurls.txt",'r') # read one line at a time until end of file
http = urllib3.PoolManager(maxsize=NUM_SOCKETS)
workers = workerpool.WorkerPool(size=NUM_WORKERS)
class MyJob(workerpool.Job):
def __init__(self, url):
self.url = url
def run(self):
r = http.request('GET', self.url)
req = urllib2.Request(url)
try:
page = urllib2.urlopen(req)
except:
print "had to skip one"
return
pagecontent = page.read() # get a file-like object at this url
#this tells it to soup the page that is at the url above
soup = BeautifulSoup(pagecontent)
#this tells it to find the string in the first instance of each of the tags in the parenthesis
title = soup.find_all('title')
article = soup.find_all('article')
try:
title = str(title[0].get_text().encode('utf-8'))
except:
print "had to skip one"
return
try:
article = str(article[0].get_text().encode('utf-8'))
except:
print "had to skip one"
return
try:
# make the file using the things above
output_files_pathname = 'DailyNews/' # path where output will go
new_filename = title + ".txt"
# write each of the things defined into the text file
outfile = open(output_files_pathname + new_filename,'w')
outfile.write(title)
outfile.write("\n")
outfile.write(article)
outfile.close()
print "%r added as a text file" % title
return
except:
print "had to skip one"
return
return
for url in urlfile:
workers.put(MyJob(url))
workers.shutdown()
workers.wait()
print "All done."
Here's an example list of the urls:
http://www.nydailynews.com/entertainment/tv-movies/x-factor-season-2-episode-2-recap-oops-britney-spears-article-1.1159546
http://www.nydailynews.com/new-york/brooklyn/lois-mclohon-resurfaced-iconic-daily-news-coney-island-cheesecake-photo-brings-back-memories-50-year-long-romance-article-1.1160457
http://www.nydailynews.com/new-york/uptown/espaillat-linares-rivals-bitter-history-battle-state-senate-seat-article-1.1157994
http://www.nydailynews.com/sports/baseball/mlb-power-rankings-yankees-split-orioles-tumble-rankings-nationals-shut-stephen-strasburg-hang-top-spot-article-1.1155953
http://www.nydailynews.com/news/national/salon-sell-internet-online-communities-article-1.1150614
http://www.nydailynews.com/sports/more-sports/jiyai-shin-wins-women-british-open-dominating-fashion-record-nine-shot-victory-article-1.1160894
http://www.nydailynews.com/entertainment/music-arts/justin-bieber-offered-hockey-contract-bakersfield-condors-minor-league-team-article-1.1157991
http://www.nydailynews.com/sports/baseball/yankees/umpire-blown-call-9th-inning-dooms-yankees-5-4-loss-baltimore-orioles-camden-yards-article-1.1155141
http://www.nydailynews.com/entertainment/gossip/kellie-pickler-shaving-head-support-best-friend-cancer-fight-hair-article-1.1160938
http://www.nydailynews.com/new-york/secret-103-000-settlement-staffers-accused-assemblyman-vito-lopez-sexual-harassment-included-penalty-20k-involved-talked-details-article-1.1157849
http://www.nydailynews.com/entertainment/tv-movies/ricki-lake-fun-adds-substance-new-syndicated-daytime-show-article-1.1153301
http://www.nydailynews.com/sports/college/matt-barkley-loyalty-usc-trojans-contention-bcs-national-championship-article-1.1152969
http://www.nydailynews.com/sports/daily-news-sports-photos-day-farewell-andy-roddick-world-1-u-s-open-champ-retires-loss-juan-martin-del-potro-article-1.1152827
http://www.nydailynews.com/entertainment/gossip/britney-spears-made-move-relationship-fiance-jason-trawick-reveals-article-1.1152722
http://www.nydailynews.com/new-york/brooklyn/brooklyn-lupus-center-tayumika-zurita-leads-local-battle-disease-difficult-adversary-article-1.1153494
http://www.nydailynews.com/life-style/fashion/kate-middleton-prabal-gurung-dress-sells-hour-myhabit-site-sold-1-995-dress-599-article-1.1161583
http://www.nydailynews.com/news/politics/obama-romney-campaigns-vie-advantage-president-maintains-lead-article-1.1161540
http://www.nydailynews.com/life-style/free-cheap-new-york-city-tuesday-sept-11-article-1.1155950
http://www.nydailynews.com/news/world/dozens-storm-embassy-compound-tunis-article-1.1159663
http://www.nydailynews.com/opinion/send-egypt-message-article-1.1157828
http://www.nydailynews.com/sports/more-sports/witnesses-feel-sheryl-crow-lance-amstrong-activities-article-1.1152899
http://www.nydailynews.com/sports/baseball/yankees/hiroki-kuroda-replacing-cc-sabathia-yankees-ace-pitcher-real-possibility-playoffs-looming-article-1.1161812
http://www.nydailynews.com/life-style/eats/finland-hosts-pop-down-restaurant-belly-earth-262-feet-underground-article-1.1151523
http://www.nydailynews.com/sports/more-sports/mighty-quinn-sept-23-article-1.1165584
http://www.nydailynews.com/sports/more-sports/jerry-king-lawler-stable-condition-suffering-heart-attack-wwe-raw-broadcast-monday-night-article-1.1156915
http://www.nydailynews.com/news/politics/ambassador-chris-stevens-breathing-libyans-found-american-consulate-rescue-article-1.1161454
http://www.nydailynews.com/news/crime/swiss-banker-bradley-birkenfeld-104-million-reward-irs-blowing-whistle-thousands-tax-dodgers-article-1.1156736
http://www.nydailynews.com/sports/hockey/nhl-board-governors-votes-favor-lockout-league-players-association-fail-reach-agreement-cba-article-1.1159131
http://www.nydailynews.com/news/national/iphone-5-works-t-network-article-1.1165543
http://www.nydailynews.com/sports/baseball/yankees/yankees-broadcasters-michael-kay-ken-singleton-opportunity-important-statement-article-1.1165479
http://www.nydailynews.com/news/national/boss-year-michigan-car-dealer-retires-employees-1-000-year-service-article-1.1156763
http://www.nydailynews.com/entertainment/tv-movies/hero-denzel-washington-clint-eastwood-article-1.1165538
http://www.nydailynews.com/sports/football/giants/ny-giants-secondary-roasted-tony-romo-dallas-cowboys-offense-article-1.1153055
http://www.nydailynews.com/news/national/hide-and-seek-tragedy-3-year-old-suffocates-hiding-bean-bag-article-1.1160138
I would try using the threading module; here is something I think is working:
from bs4 import BeautifulSoup, SoupStrainer
import threading
import urllib2
def fetch_url(url):
urlHandler = urllib2.urlopen(url)
html = urlHandler.read()
#this tells it to soup the page that is at the url above
soup = BeautifulSoup(html)
#this tells it to find the string in the first instance of each of the tags in the parenthesis
title = soup.find_all('title')
article = soup.find_all('article')
try:
title = str(title[0].get_text().encode('utf-8'))
except:
print "had to skip one bad title\n"
return
try:
article = str(article[0].get_text().encode('utf-8'))
except:
print "had to skip one bad article"
return
try:
# make the file using the things above
output_files_pathname = 'DailyNews/' # path where output will go
new_filename = title + ".txt"
# write each of the things defined into the text file
outfile = open(output_files_pathname + new_filename, 'w')
outfile.write(title)
outfile.write("\n")
outfile.write(article)
outfile.close()
print "%r added as a text file" % title
return
except:
print "had to skip one cant write file"
return
return
with open("dailynewsurls.txt", 'r') as urlfile:
# read one line at a time until end of file
threads = [threading.Thread(target=fetch_url, args=(url,)) for url in urlfile]
for thread in threads:
thread.start()
for thread in threads:
thread.join()
Here i have written the code using python and beautiful soup to parse all the links on that page into a repository of links. Next, it fetches the contents of any of the url from the repository just created, parses the links from this new content into the repository and continues this process for all links in the repository until stopped or after a given number of links are fetched.
But this code is very slow. How can i improve it by using asynchronous programming using gevents in python ?
Code
class Crawler(object):
def __init__(self):
self.soup = None # Beautiful Soup object
self.current_page = "http://www.python.org/" # Current page's address
self.links = set() # Queue with every links fetched
self.visited_links = set()
self.counter = 0 # Simple counter for debug purpose
def open(self):
# Open url
print self.counter , ":", self.current_page
res = urllib2.urlopen(self.current_page)
html_code = res.read()
self.visited_links.add(self.current_page)
# Fetch every links
self.soup = BeautifulSoup.BeautifulSoup(html_code)
page_links = []
try :
page_links = itertools.ifilter( # Only deal with absolute links
lambda href: 'http://' in href,
( a.get('href') for a in self.soup.findAll('a') ) )
except Exception as e: # Magnificent exception handling
print 'Error: ',e
pass
# Update links
self.links = self.links.union( set(page_links) )
# Choose a random url from non-visited set
self.current_page = random.sample( self.links.difference(self.visited_links),1)[0]
self.counter+=1
def run(self):
# Crawl 3 webpages (or stop if all url has been fetched)
while len(self.visited_links) < 3 or (self.visited_links == self.links):
self.open()
for link in self.links:
print link
if __name__ == '__main__':
C = Crawler()
C.run()
Update 1
import gevent.monkey; gevent.monkey.patch_thread()
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse
import sys
import gevent.monkey; gevent.monkey.patch_all(thread=False)
class Crawler(object):
def __init__(self):
self.soup = None # Beautiful Soup object
self.current_page = "http://www.python.org/" # Current page's address
self.links = set() # Queue with every links fetched
self.visited_links = set()
self.counter = 0 # Simple counter for debug purpose
def open(self):
# Open url
print self.counter , ":", self.current_page
res = urllib2.urlopen(self.current_page)
html_code = res.read()
self.visited_links.add(self.current_page)
# Fetch every links
self.soup = BeautifulSoup(html_code)
page_links = []
try :
for link in [h.get('href') for h in self.soup.find_all('a')]:
print "Found link: '" + link + "'"
if link.startswith('http'):
print 'entered in if link: ',link
page_links.append(link)
print "Adding link" + link + "\n"
elif link.startswith('/'):
print 'entered in elif link: ',link
parts = urlparse.urlparse(self.current_page)
page_links.append(parts.scheme + '://' + parts.netloc + link)
print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
else:
print 'entered in else link: ',link
page_links.append(self.current_page+link)
print "Adding link " + self.current_page+link + "\n"
except Exception, ex: # Magnificent exception handling
print ex
# Update links
self.links = self.links.union( set(page_links) )
# Choose a random url from non-visited set
self.current_page = random.sample( self.links.difference(self.visited_links),1)[0]
self.counter+=1
def run(self):
# Crawl 3 webpages (or stop if all url has been fetched)
crawling_greenlets = []
for i in range(3):
crawling_greenlets.append(gevent.spawn(self.open))
gevent.joinall(crawling_greenlets)
#while len(self.visited_links) < 4 or (self.visited_links == self.links):
# self.open()
for link in self.links:
print link
if __name__ == '__main__':
C = Crawler()
C.run()
import gevent and make sure monkey-patching is done to make standard library calls non-blocking and aware of gevent:
import gevent
from gevent import monkey; monkey.patch_all()
(you can selectively decide what has to be monkey-patched, but let's say it is not
your problem at the moment)
In your run, make your open function to be called inside a greenlet. run can
return the greenlet object, so you can wait for it whenever you need to get the
results using gevent.joinall for example. Something like this:
def run(self):
return gevent.spawn(self.open)
c1 = Crawler()
c2 = Crawler()
c3 = Crawler()
crawling_tasks = [c.run() for c in (c1,c2,c3)]
gevent.joinall(crawling_tasks)
print [c.links for c in (c1, c2, c3)]
I made simple crawler using python and sqlite3. but there are some errors in the cmd screen. so I have searched this kind of error from stackoverflow.com. but i can't find the solution. some Q&A suggested me that I have to use ? instead of % on the sqlite command like SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url. but it was not working.
here is the error.
Traceback (most recent call last):
File "C:\Python27\crawl.py", line 239, in (module)
parseArticle( u )
File "C:\Python27\crawl.py", line 146, in parseArticle
gaterNeighborInfo(soup)
File "C:\Python27\crawl.py", line 68, in gaterNeighborInfo
if url and url.startswith('http://') and db.isCrawledURL(url)<1:
File "C:\Python27\crawl.py", line 217, in isCrawledURL
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url)
OperationalError: near "state": syntax error
As you see, this error seems to be hierarchical. but I don't know what is wrong and where this error starts.
here is the source code.
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import robotparser
import urllib2
import time, traceback, re, sys, os
import sqlite3
crawler_name = 'python_daum_crawler'
mainpage = 'http://blog.daum.net/'
mainpath = './data/'
# robot parser를 설정합니다.
rp = robotparser.RobotFileParser(mainpage + 'robot.txt')
rp.read()
def canFetch(url):
"수집 가능 여부를 체크합니다."
return rp.can_fetch(crawler_name, url)
def getContent(url, delay=1):
"웹문서를 다운로드 합니다."
time.sleep(delay)
if not canFetch(url):
# 웹마스터가 수집을 원치 않는 페이지는 수집을 하지 않습니다.
print('This url can NOT be fetched by our crawler :', url)
return None
try:
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', crawler_name)]
contents = opener.open(url).read()
except:
traceback.print_exc()
return None
return contents
def getArticleInfo(soup):
"daum blog 내의 article info를 얻어 옵니다."
rBlog = re.compile('.+blog.daum.net/|w+/|d+.*?')
URLs = soup('a',{'href':rBlog})
return [ u.get('href').split('?')[0] for u in URLs ]
def getOwnArticles(contents):
"해당 블로그에 포함되는 글의 목록을 가져옵니다."
ret = []
soup = BeautifulSoup(contents)
rBlog = re.compile('.+/BlogView.+')
for u in soup('a', {'href':rBlog}):
href = u.get('href')
article = href.split('articleno=')[1].split('&')[0]
if ret.count(article)<1:
ret.append(article)
return ret
def gatherNeighborInfo(soup):
"이웃 블로거/혹은 다녀간 블로거 정보를 수집합니다."
#daum blog 관련 주소를 찾습니다.
rBlog = re.compile('http://blog.daum.net/|w+')
Neighbors = soup('a',{'href':rBlog})
cnt = 0
for n in Neighbors:
url = n.get('href')
blogname = url.split('/')[-1]
if url and url.startswith('http://') and db.isCrawledURL(url)<1:
db.insertURL( url, 1 )
url2 = getRedirectedURL(url)
if not url2: continue
re_url = 'http://blog.daum.net' + url2
body = getContent(re_url, 0)
if body:
for u in getOwnArticles(body):
#자신의 글 주소를 db에 저장합니다.
fullpath = 'http://blog.daum.net/'+blogname+'/'+u
cnt += db.insertURL(fullpath)
if cnt>0: print('%d neighbor articles inserted'%cnt)
def getRedirectedURL(url):
"본문에 해당하는 프레임의 url을 얻어옵니다."
contents = getContent(url)
if not contents: return None
#redirect
try:
soup = BeautifulSoup(contents)
frame = soup('frame')
src = frame[0].get('src')
except:
src = None
return src
def getBody(soup, parent):
"본문 텍스트를 구합니다."
#본문 주소를 포함한 iframe을 찾습니다.
rSrc = re.compile('.+/ArticleContentsView.+')
iframe = soup('iframe',{'src':rSrc})
if len(iframe)>0:
src = iframe[0].get('src')
iframe_src = 'http://blog.daum.net'+src
#그냥 request하면 안 되고, referer를 지정해야 browser를 통해 요청한 것으로 인식합니다.
req = urllib2.Request(iframe_src)
req.add_header('Refere', parent)
body = urllib2.urlopen(req).read()
soup = BeautifulSoup(body)
return str(soup.body)
else:
print('NULL contents')
return ''
def parseArticle(url):
"해당 url을 parsing하고 저장합니다."
#blog id와 article id를 얻습니다.
article_id = url.split('/')[-1]
blog_id = url.split('/')[-2]
#redirect된 주소를 얻어 옵니다.
newURL = getRedirectedURL(url)
if newURL:
try:
#blog 디렉터리를 만듭니다.
os.mkdir(mainpath+blog_id)
except:
#디렉터리를 만들다 에러가 난 경우 무시합니다.
pass
newURL = 'http://blog.daum.net'+newURL
contents = getContent(newURL, 0)
if not contents:
print('Null Contents...')
#해당 url이 유효하지 않은 경우 에러(-1)로 표시합니다.
db.updateURL(url, -1)
return
#HTML을 파싱합니다.
soup = BeautifulSoup(contents)
#이웃 블로거 정보가 있나 확인합니다.
gatherNeighborInfo(soup)
#블로그 URL이 있을 경우 db에 삽입합니다.
n=0
for u in getArticleInfo(soup):
n += db.insertURL(u)
if n>0: print('inserted %d urls from %s'%(n,url))
#title을 얻습니다.
sp = contents.find('<title>')
if sp>-1:
ep = contents[sp+7:].find('<title>')
title = contents[sp+7:sp+ep+7]
else:
title = ''
#본문 HTML을 보기 쉽게 정리합니다.
contents = getBody(soup, newURL)
#script를 제거합니다.
pStyle = re.compile('<style(.*?)>(.*?)</style>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
pStyle = re.compile('<script(.*?)>(.*?)</script>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
pStyle = re.compile("<(.*?)>", re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
#txt file을 저장합니다.
fTXT = open( mainpath + blog_id + '/' + article_id + '.txt', 'w')
fTXT.write( title+'|n')
fTXT.write(contents)
fTXT.close()
#처리했다고 db에 표시합니다.
db.updateURL(url)
else:
print('Invalid blog article...')
#해당 url이 유효하지 않은 경우 에러(-1)로 표시합니다.
db.updateURL(url, -1)
class DB:
"SQLITE3 wrapper class"
def __init__(self):
self.conn = sqlite3.connect('crawlerDB')
self.cursor = self.conn.cursor()
self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url text, state int)')
self.cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS IDX001 ON urls(url)')
self.cursor.execute('CREATE INDEX IF NOT EXISTS IDX002 ON urls(state)')
def __del__(self):
self.conn.commit()
self.cursor.close()
def insertURL(self, url, state=0):
try:
self.cursor.execute("INSERT INTO urls VALUES ('%s',%d)"%(url,state))
self.conn.commit()
except:
return 0
else:
return 1
def selectUncrawledURL(self):
self.cursor.execute('SELECT * FROM urls where state=0')
return [ row[0] for row in self.cursor.fetchall() ]
def updateURL(self, url, state=1):
self.cursor.execute("UPDATE urls SET state=%d WHERE url='%s'"%(state,url))
def isCrawledURL(self, url):
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url)
ret = self.cursor.fetchone()
return ret[0]
db = DB()
if __name__=='__main__':
print('starting crawl.py...')
#메인 페이지를 체크합니다.
contents = getContent(mainpage)
URLs = getArticleInfo( BeautifulSoup( contents ) )
nSuccess = 0
for u in URLs:
nSuccess += db.insertURL(u)
print('inserted %d new pages.'%nSuccess)
while 1:
for u in db.selectUncrawledURL():
#아직 읽지 않은 url을 얻어서 처리합니다.
print('downloading %s'%u)
try:
parseArticle( u )
except:
traceback.print_exc()
db.updateURL( u, -1 )
You are generating incorrect SQL; you probably want a url=... AND state=1 (with a space and AND to match both criteria.
Also, you should not use string interpolation, use SQL parameters instead:
def isCrawledURL(self, url):
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url=? AND state=1", (url,))
ret = self.cursor.fetchone()
return ret[0]
This applies to all your queries, like:
self.cursor.execute("INSERT INTO urls VALUES (?, ?)", (url,state))
and:
self.cursor.execute("UPDATE urls SET state=? WHERE url=?", (state,url))
Note that the parameters are passed into the cursor.execute() calls as a second argument (a sequence of values).
You are missing a whitespace and an AND keyword before state in your query.
i am trying to add entries to a playlist in youtube via the code below. when i pass the playlist uri (http://gdata.youtube.com/feeds/api/users/nashrafeeg/playlists/0F4EF4B14F514476?client=Reddit+playlist+maker) to AddPlaylistVideoEntryToPlaylist method i get from the get playlist method i get error saying Invalid request URI. what is the best way to fix this ?
import urllib,re
import gdata.youtube
import gdata.youtube.service
class reddit():
def __init__(self, rssurl ='http://www.reddit.com/r/chillmusic.rss' ):
self.URL = rssurl
self._downloadrss()
def _downloadrss(self):
if self.URL.endswith('.rss'):
# Downloadd the RSS feed of the subreddit - save as "feed.rss"
try:
print "Downloading rss from reddit..."
urllib.urlretrieve (URL, "feed.rss")
except Exception as e:
print e
def clean(self):
playList = open("feed.rss").read()
links = re.findall(r'(http?://www.youtube.com\S+)', playList)
for link in links:
firstPass = link.replace('">[link]</a>', '')
secondPass = firstPass.replace('&fmt=18', '')
thirdpass = secondPass.replace('&feature=related', '')
finalPass = thirdpass.replace('http://www.youtube.com/watch?v=', '')
print thirdpass, "\t Extracted: ", finalPass
return finalPass
class google():
def __init__(self, username, password):
self.Username = username
self.password = password
#do not change any of the following
self.key = 'AI39si5DDjGYhG_1W-8n_amjgEjbOU27sa0aw2RQI5gOaoK5KqCD2Fzffbkh8oqGu7CqFQLLQ7N7wK0gz7lrTQbd70srC72Niw'
self.appname = 'Reddit playlist maker'
self.service = gdata.youtube.service.YouTubeService()
def authenticate(self):
self.service.email = self.Username
self.service.password = self.password
self.service.developer_key = self.key
self.service.client_id = self.appname
self.service.source = self.appname
self.service.ssl = False
self.service.ProgrammaticLogin()
def get_playlists(self):
y_playlist = self.service.GetYouTubePlaylistFeed(username='default')
l = []
k = []
for p in y_playlist.entry:
k=[]
k=[p.link[1].href, p.title.text]
l.append(k)
return l
def get_playlist_id_from_url(self, href):
#quick and dirty method to get the playList id's
return href.replace('http://www.youtube.com/view_play_list?p=','')
def creat_playlist(self, name="Reddit list", disc ="videos from reddit"):
playlistentry = self.service.AddPlaylist(name, disc)
if isinstance(playlistentry, gdata.youtube.YouTubePlaylistEntry):
print 'New playlist added'
return playlistentry.link[1].href
def add_video_to_playlist(self,playlist_uri,video):
video_entry = self.service.AddPlaylistVideoEntryToPlaylist(
playlist_uri, video)
if isinstance(video_entry, gdata.youtube.YouTubePlaylistVideoEntry):
print 'Video added'
URL = "http://www.reddit.com/r/chillmusic.rss"
r = reddit(URL)
g = google('xxxxx#gmail.com', 'xxxx')
g.authenticate()
def search_playlist(playlist="Reddit list3"):
pl_id = None
for pl in g.get_playlists():
if pl[1] == playlist:
pl_id = pl[0]
print pl_id
break
if pl_id == None:
pl_id = g.creat_playlist(name=playlist)
return pl_id
pls = search_playlist()
for video_id in r.clean():
g.add_video_to_playlist(pls, video_id)
Don't know how to get it but if you strip your playlist_uri of your '/users/[username]' it will work.
Example:
playlist_uri
http://gdata.youtube.com/feeds/api/users/[username]/playlists/[long_id]
Should become
http://gdata.youtube.com/feeds/api/playlists/[long_id]