I made a simple crawler using Python and sqlite3, but I am getting errors on the cmd screen. I have searched for this kind of error on stackoverflow.com but couldn't find a solution. Some Q&As suggested that I use ? instead of % in the sqlite command, like SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url, but it did not work.
Here is the error:
Traceback (most recent call last):
File "C:\Python27\crawl.py", line 239, in (module)
parseArticle( u )
File "C:\Python27\crawl.py", line 146, in parseArticle
gaterNeighborInfo(soup)
File "C:\Python27\crawl.py", line 68, in gaterNeighborInfo
if url and url.startswith('http://') and db.isCrawledURL(url)<1:
File "C:\Python27\crawl.py", line 217, in isCrawledURL
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url)
OperationalError: near "state": syntax error
As you can see, the error surfaces through several nested calls, but I don't know what is actually wrong or where it starts.
Here is the source code:
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import robotparser
import urllib2
import time, traceback, re, sys, os
import sqlite3
crawler_name = 'python_daum_crawler'
mainpage = 'http://blog.daum.net/'
mainpath = './data/'
# Set up the robots.txt parser.
rp = robotparser.RobotFileParser(mainpage + 'robots.txt')
rp.read()
def canFetch(url):
"수집 가능 여부를 체크합니다."
return rp.can_fetch(crawler_name, url)
def getContent(url, delay=1):
"웹문서를 다운로드 합니다."
time.sleep(delay)
if not canFetch(url):
# Do not crawl pages that the webmaster does not want collected.
print('This url can NOT be fetched by our crawler :', url)
return None
try:
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', crawler_name)]
contents = opener.open(url).read()
except:
traceback.print_exc()
return None
return contents
def getArticleInfo(soup):
"daum blog 내의 article info를 얻어 옵니다."
rBlog = re.compile('.+blog.daum.net/|w+/|d+.*?')
URLs = soup('a',{'href':rBlog})
return [ u.get('href').split('?')[0] for u in URLs ]
def getOwnArticles(contents):
"해당 블로그에 포함되는 글의 목록을 가져옵니다."
ret = []
soup = BeautifulSoup(contents)
rBlog = re.compile('.+/BlogView.+')
for u in soup('a', {'href':rBlog}):
href = u.get('href')
article = href.split('articleno=')[1].split('&')[0]
if ret.count(article)<1:
ret.append(article)
return ret
def gatherNeighborInfo(soup):
"이웃 블로거/혹은 다녀간 블로거 정보를 수집합니다."
#daum blog 관련 주소를 찾습니다.
rBlog = re.compile('http://blog.daum.net/|w+')
Neighbors = soup('a',{'href':rBlog})
cnt = 0
for n in Neighbors:
url = n.get('href')
blogname = url.split('/')[-1]
if url and url.startswith('http://') and db.isCrawledURL(url)<1:
db.insertURL( url, 1 )
url2 = getRedirectedURL(url)
if not url2: continue
re_url = 'http://blog.daum.net' + url2
body = getContent(re_url, 0)
if body:
for u in getOwnArticles(body):
#Save the blogger's own article urls to the db.
fullpath = 'http://blog.daum.net/'+blogname+'/'+u
cnt += db.insertURL(fullpath)
if cnt>0: print('%d neighbor articles inserted'%cnt)
def getRedirectedURL(url):
"본문에 해당하는 프레임의 url을 얻어옵니다."
contents = getContent(url)
if not contents: return None
#redirect
try:
soup = BeautifulSoup(contents)
frame = soup('frame')
src = frame[0].get('src')
except:
src = None
return src
def getBody(soup, parent):
"본문 텍스트를 구합니다."
#본문 주소를 포함한 iframe을 찾습니다.
rSrc = re.compile('.+/ArticleContentsView.+')
iframe = soup('iframe',{'src':rSrc})
if len(iframe)>0:
src = iframe[0].get('src')
iframe_src = 'http://blog.daum.net'+src
#A plain request is not enough; the Referer header must be set so the request is treated as coming from a browser.
req = urllib2.Request(iframe_src)
req.add_header('Referer', parent)
body = urllib2.urlopen(req).read()
soup = BeautifulSoup(body)
return str(soup.body)
else:
print('NULL contents')
return ''
def parseArticle(url):
"해당 url을 parsing하고 저장합니다."
#blog id와 article id를 얻습니다.
article_id = url.split('/')[-1]
blog_id = url.split('/')[-2]
#Get the redirected address.
newURL = getRedirectedURL(url)
if newURL:
try:
#Create the blog directory.
os.mkdir(mainpath+blog_id)
except:
#Ignore any error raised while creating the directory.
pass
newURL = 'http://blog.daum.net'+newURL
contents = getContent(newURL, 0)
if not contents:
print('Null Contents...')
#If the url is not valid, mark it as an error (-1).
db.updateURL(url, -1)
return
#Parse the HTML.
soup = BeautifulSoup(contents)
#Check whether there is neighbor blogger info.
gatherNeighborInfo(soup)
#Insert any blog URLs found into the db.
n=0
for u in getArticleInfo(soup):
n += db.insertURL(u)
if n>0: print('inserted %d urls from %s'%(n,url))
#Get the title.
sp = contents.find('<title>')
if sp>-1:
ep = contents[sp+7:].find('</title>')
title = contents[sp+7:sp+ep+7]
else:
title = ''
#Tidy up the body HTML for readability.
contents = getBody(soup, newURL)
#Remove style and script tags.
pStyle = re.compile('<style(.*?)>(.*?)</style>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
pStyle = re.compile('<script(.*?)>(.*?)</script>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
pStyle = re.compile("<(.*?)>", re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
#Save the txt file.
fTXT = open( mainpath + blog_id + '/' + article_id + '.txt', 'w')
fTXT.write( title+'\n')
fTXT.write(contents)
fTXT.close()
#Mark the url as processed in the db.
db.updateURL(url)
else:
print('Invalid blog article...')
#If the url is not valid, mark it as an error (-1).
db.updateURL(url, -1)
class DB:
"SQLITE3 wrapper class"
def __init__(self):
self.conn = sqlite3.connect('crawlerDB')
self.cursor = self.conn.cursor()
self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url text, state int)')
self.cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS IDX001 ON urls(url)')
self.cursor.execute('CREATE INDEX IF NOT EXISTS IDX002 ON urls(state)')
def __del__(self):
self.conn.commit()
self.cursor.close()
def insertURL(self, url, state=0):
try:
self.cursor.execute("INSERT INTO urls VALUES ('%s',%d)"%(url,state))
self.conn.commit()
except:
return 0
else:
return 1
def selectUncrawledURL(self):
self.cursor.execute('SELECT * FROM urls where state=0')
return [ row[0] for row in self.cursor.fetchall() ]
def updateURL(self, url, state=1):
self.cursor.execute("UPDATE urls SET state=%d WHERE url='%s'"%(state,url))
def isCrawledURL(self, url):
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url)
ret = self.cursor.fetchone()
return ret[0]
db = DB()
if __name__=='__main__':
print('starting crawl.py...')
#Check the main page.
contents = getContent(mainpage)
URLs = getArticleInfo( BeautifulSoup( contents ) )
nSuccess = 0
for u in URLs:
nSuccess += db.insertURL(u)
print('inserted %d new pages.'%nSuccess)
while 1:
for u in db.selectUncrawledURL():
#Fetch urls that have not been read yet and process them.
print('downloading %s'%u)
try:
parseArticle( u )
except:
traceback.print_exc()
db.updateURL( u, -1 )
You are generating incorrect SQL; you probably want url=... AND state=1 (with a space and an AND so that both criteria have to match).
Also, you should not use string interpolation, use SQL parameters instead:
def isCrawledURL(self, url):
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url=? AND state=1", (url,))
ret = self.cursor.fetchone()
return ret[0]
This applies to all your queries, like:
self.cursor.execute("INSERT INTO urls VALUES (?, ?)", (url,state))
and:
self.cursor.execute("UPDATE urls SET state=? WHERE url=?", (state,url))
Note that the parameters are passed into the cursor.execute() calls as a second argument (a sequence of values).
You are missing a space and an AND keyword before state in your query.
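Putting both fixes together, here is a minimal sketch of the DB wrapper with every query parameterized (same urls(url, state) schema as in the question; untested):
import sqlite3

class DB:
    "SQLITE3 wrapper class, using parameterized queries"
    def __init__(self):
        self.conn = sqlite3.connect('crawlerDB')
        self.cursor = self.conn.cursor()
        self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url text, state int)')
        self.cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS IDX001 ON urls(url)')
    def insertURL(self, url, state=0):
        try:
            # The driver quotes the values itself; no string interpolation needed.
            self.cursor.execute("INSERT INTO urls VALUES (?, ?)", (url, state))
            self.conn.commit()
        except sqlite3.Error:
            return 0
        return 1
    def updateURL(self, url, state=1):
        self.cursor.execute("UPDATE urls SET state=? WHERE url=?", (state, url))
    def isCrawledURL(self, url):
        self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url=? AND state=1", (url,))
        return self.cursor.fetchone()[0]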
Related
I am using Scrapy to crawl a website.
In the code I am using more than one callback function, and the data related to one search result is retrieved across two callback functions. Like this:
class PubmedProjSpider(CrawlSpider):
name = str(CONFIG.get('project_name', 'project_name'))
start_urls = ['https://pubmed.ncbi.nlm.nih.gov/?term=(((((((((((((((((((((((((sodium%20oxybate%5BText%20Word%5D)%20OR%20(Xyrem%5BText%20Word%5D))%20OR%20(certolizumab%20pegol%5BText%20Word%5D))%20OR%20(Cimzia%5BText%20Word%5D))%20OR%20(vancomycin%20hydrochloride%5BText%20Word%5D))%20OR%20(Vancomycin%5BText%20Word%5D))%20OR%20(Vancocin%5BText%20Word%5D))%20OR%20(atorvastatin%20calcium%20trihydrate%5BText%20Word%5D))%20OR%20(atorvastatin%5BText%20Word%5D))%20OR%20(Lipitor))%20OR%20(alprostadil%5BText%20Word%5D))%20OR%20(Caverject%5BText%20Word%5D))%20OR%20(atenolol%5BText%20Word%5D))%20OR%20(Tenormin%5BText%20Word%5D))%20OR%20(tramadol%20hydrochloride%5BText%20Word%5D))%20OR%20(tramadol%5BText%20Word%5D))%20OR%20(Maneo%5BText%20Word%5D))%20OR%20(temazepam%5BText%20Word%5D))%20OR%20(citalopram%20hydrobromide%5BText%20Word%5D))%20OR%20(citalopram%5BText%20Word%5D))%20OR%20(Cipramil%5BText%20Word%5D))%20OR%20(fluticasone%20propionate%5BText%20Word%5D))%20OR%20(fluticasone%5BText%20Word%5D))%20OR%20(Cutivate%5BText%20Word%5D)))%20AND%20((%222020%2F03%2F03%22%5BDate%20-%20Create%5D%20%3A%20%222020%2F03%2F05%22%5BDate%20-%20Create%5D))&filter=simsearch2.ffrft&pos=6']
path = r"C:\Users\vighnesh.paramasivam\Documents\pubmed_organised_copy\pubmed_organised\pubmed\pubmed\output_pdf_files"
def __init__(self):
self.file_storage_location = CONFIG.get('storage_location', 'text_storage_destination')
def parse(self, response):
try:
hxs = Selector(response)
items = []
titles = hxs.xpath("//div[@class='docsum-wrap']//div[@class='docsum-content']")
items.append(titles)
for title in items:
for href in title.xpath("a/@href").extract():
yield Request(
url=response.urljoin(href),
callback=self.parse_article
)
if response.xpath("//button[#class='load-button next-page']"):
temp_url = response.xpath("//div[#data-next-page-url]/#data-next-page-url").getall()[0]
next_page_url = response.urljoin(temp_url)
next_page_url = next_page_url.replace('/more','')
yield Request(
url = next_page_url,
callback=self.parse)
except Exception as message:
#print("###### exception from parse method")
raise CloseSpider(message)
def parse_article(self, response):
try:
w={}
w['title'] = str(' '.join(response.xpath('.//h1[@class="heading-title"]')[0].xpath(".//text()").getall()).encode('utf-8').lstrip().rstrip())
w['url'] = str(response).split(' ')[-1].strip('>')
w['pmcid'] = str(response.xpath(".//ul/li/span[@class='identifier pubmed']/strong[@title='PubMed ID']/text()").getall()[0])
w['authors'] = response.xpath('//div[@class="inline-authors"]/div[@class="authors"]/div[@class="authors-list"]/span/a/text()').getall()
abstract = {'Free-Text':[]}
w['pdf_downloaded'] = 'No'
w['pdf_links'] = ''
q = response.xpath("//div[@class='abstract'][@id='abstract']").getall()
if response.xpath("//div[@class='full-text-links-list']/a/@href"):
w['pdf_links'] = list(set(response.xpath("//div[@class='full-text-links-list']/a/@href").getall()))
if q:
for i in response.xpath("//div[@class='abstract'][@id='abstract']/div[@class='abstract-content selected']/p"):
strong_format = i.xpath("./strong//text()").getall()
bold_format = i.xpath("./b/text()").getall()
if strong_format:
abstract[i.xpath("./strong//text()").getall()[0].strip().strip(':').lstrip()] = ' '.join(i.xpath("./text()").getall()).lstrip().rstrip()
elif bold_format:
headings = response.xpath("//div[@class='abstract'][@id='abstract']/div[@class='abstract-content selected']/p/b/text()").getall()
if headings:
if response.xpath('normalize-space(substring-before(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[0])).getall():
abstract['Free-Text'] = response.xpath('normalize-space(substring-before(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[0])).getall()[0]
for num, header in enumerate(headings):
if num != len(headings)-1:
abstract[header] = response.xpath('normalize-space(substring-before(substring-after(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]),//b[.="{}"]))'.format(headings[num], headings[num+1])).getall()[0]
else:
abstract[header] = response.xpath('normalize-space(substring-after(string(//div[@class="abstract"][@id="abstract"]/div[@class="abstract-content selected"]/p),//b[.="{}"]))'.format(headings[num])).getall()[0]
else:
abstract['Free-Text'].append((' '.join(i.xpath(".//text()").getall()).lstrip().rstrip()))
if response.xpath("//div[#class='abstract'][#id='abstract']/p/strong[contains(text(), 'Keywords:')]"):
abstract['Keywords']=' '.join(response.xpath("//div[#class='abstract'][#id='abstract']/p/text()").getall()).strip()
w['abstract'] = abstract
path = os.path.join(self.file_storage_location,'PMCID_'+w['pmcid']+'.txt')
with open(path, 'w') as e:
for p in w.items():
e.write("%s:%s\n\n" % p)
if 'PMC' in response.xpath(".//div[@class='full-text-links-list']/a/@data-ga-action").getall():
pdf_url = response.xpath(".//div[@class='full-text-links-list']/a[@data-ga-action='PMC']/@href").getall()[0]
#for href in response.css('a[href$=".pdf"]::attr(href)').extract():
yield Request(
url=response.urljoin(pdf_url),
callback=self.link, meta={'hero_item': w['pmcid']}
)
yield(w)
except Exception as message:
#print("###############Exception from parse_article")
raise CloseSpider(message)
def link(self, response):
print("################# entering link function")
try:
if response.xpath('.//div[@class="format-menu"]/ul/li/a[contains(text(), "PDF")]/@href'):
link1 = response.xpath('.//div[@class="format-menu"]/ul/li/a[contains(text(), "PDF")]/@href').getall()[0]
item = response.meta.get('hero_item')
yield Request(
url=response.urljoin(link1),
callback=self.save_pdf, meta={'hero_item': item}
)
except Exception as message:
#print("###############Exception from link")
pass
def save_pdf(self, response):
try:
print("################# entering pdf function")
item = response.meta.get('hero_item')
path = self.path + "\\"+ "PMCID_" + item + '.pdf'
self.logger.info('Saving PDF %s', path)
with open(path, 'wb') as f:
f.write(response.body)
except Exception as message:
pass
As the code above shows, all the details are extracted in "parse_article", but one piece of information, whether "pdf_downloaded" should be set, is only decided in the save_pdf callback.
Now that the data lives in two callback functions, how can I combine it before storing it?
Any help is appreciated!!
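One common Scrapy pattern (a sketch with placeholder names, not the original spider) is to carry the partially built item through Request.meta and yield it only from the last callback in the chain, once pdf_downloaded is known:
import scrapy

class PdfStatusSketchSpider(scrapy.Spider):
    name = "pdf_status_sketch"                 # hypothetical spider name
    start_urls = ["https://example.com/"]      # placeholder start page

    def parse(self, response):
        # Fields scraped on the article page (placeholders here).
        item = {"pmcid": "12345", "pdf_downloaded": "No"}
        pdf_url = response.urljoin("/article.pdf")   # placeholder PDF link
        # Hand the whole item to the next callback instead of yielding it now.
        yield scrapy.Request(pdf_url, callback=self.save_pdf, meta={"item": item})

    def save_pdf(self, response):
        item = response.meta["item"]
        with open("PMCID_%s.pdf" % item["pmcid"], "wb") as f:
            f.write(response.body)
        item["pdf_downloaded"] = "Yes"   # the status is only known in this callback
        yield item                       # yielded once, with data from both callbacks
If the PDF request can fail, adding an errback that yields the item with pdf_downloaded still set to "No" keeps those results from being lost.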
I have tried to run my program, but each time I get an error in the middle of the run.
Basically, my program does this:
1. get the xml from my website
2. run over all the urls
3. get data from my web page (sku, name, title, price, etc.)
4. get the lowest price from another website, by comparing prices for the same sku
The problem is that I have more than 7,000 urls in my xml, so my program hits a network error every time.
What can I do? How can I resolve it?
def parse_sitemap (url):
resp = requests.get(XXXX)
for u in urls:
loc = u.find ('loc').string
# not a sitemap requirement skip if not present
out.append ([loc])
return out
def get_sku (u):
html = requests.get(u)
bsObj = BeautifulSoup(html.content,'xml')
sku = bsObj.find('span',attrs={'itemprop':'sku'}).get_text()
return sku
def get_price ( u):
try:
html = requests.get(u)
bsObj = BeautifulSoup(html.content,'xml')
price = bsObj.find('span',attrs={'itemprop':'price'}).get_text()
price = str(price).replace(' ₪','')
return price
except:
return 'no price'
def get_zapPrice (makat):
try:
check ='https://www.zap.co.il/search.aspx?keyword='+makat
r = requests.get(check)
html = requests.get(r.url)
bsObj = BeautifulSoup(html.content,'html.parser')
zapPrice = bsObj.select_one('div.StoresLines div.PriceNum').text.strip().replace(' ₪','')
return zapPrice
except:
return 'no zap product'
def get_zapStoreName (makat):
try:
check ='https://www.zap.co.il/search.aspx?keyword='+makat
r = requests.get(check)
html = requests.get(r.url)
bsObj = BeautifulSoup(html.content,'html.parser')
storeName = bsObj.select_one('div.StoresLines div.BuyButtonsTxt').text.strip().replace('ב-','')
return storeName
except:
return 'no zap product'
for u in urls:
ws1 [ 'A1' ] = u
makat = get_sku(u)
ws1 [ 'F1' ] = makat
zapPrice = get_zapPrice(makat)
ws1['I1'] = zapPrice
storeName = get_zapStoreName(makat)
ws1['J1'] = storeName
ws1.insert_rows(1)
ws1.append ([])
print("writing product no." + str(i))
ws1['F1'] = 'makat'
ws1['I1'] = 'zap price'
ws1['J1'] = 'zap store'
wb.save ("sample.xlsx")
wb.close ()
print ('end')
I didn't include all my code, but the basic structure is here.
Each def starts with requests.get, gets what I want, and returns it.
After that, I write it to an Excel file.
The problem shows up after about 1,000 url checks...
What is the problem?
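Without the exact traceback it is hard to say which network error this is, but with 7,000+ requests some transient failures are normal. A common approach (a sketch, not your exact code) is to reuse a single requests.Session with automatic retries and a timeout, and to skip or re-queue urls that still fail:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    retry = Retry(
        total=5,                  # retry each request up to 5 times
        backoff_factor=1,         # wait 1s, 2s, 4s, ... between attempts
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

session = make_session()

def get_html(url):
    try:
        resp = session.get(url, timeout=10)
        resp.raise_for_status()
        return resp.content
    except requests.RequestException:
        return None   # the caller can skip this url or add it to a retry list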
One of my friends was developing a Scrapy script to scrape data from a page.
After some time, I needed to add another field to it, and I added the field successfully. But the problem is that the field is not getting the data from the links inside the td. The field name is "Last Batsman".
Data URL:
http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385
XPath of the Data:
//*[#id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td
import scrapy
from bs4 import BeautifulSoup
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector
from digicricket.items import ODIorTestItem
class DigicricketMarsilOp1Spider(scrapy.Spider):
name = "digicricket.marssil.op1"
allowed_domains = ["digicricket.marssil.com"]
def __init__(self, match_id=None):
if match_id:
match_id_list = match_id.split(',')
for i in match_id_list:
if not i.isdigit():
raise CloseSpider('Match ID = {0} is not a number'.format(i))
else:
self.start_urls = ['http://digicricket.marssil.com/match/MatchData.aspx?op=1&match={0}'.format(i)
for i in match_id_list]
else:
raise CloseSpider('You forgot input Match ID/IDs')
def parse(self, response):
item = ODIorTestItem()
item['Batsman_op1'] = []
item['Bowler_op1'] = []
item['other_op1'] = []
sel = Selector(response)
tables = sel.xpath('//div[@id="ctl00_ContentPlaceHolder1_divData"]/table').extract()
row_for_other = dict()
for i in xrange(len(tables)):
html_text = BeautifulSoup(tables[i])
if i == 1:
sl = 0
for tr in html_text.find_all('tr'):
td = tr.find_all('td')
if td:
sl += 1
row = dict()
row['sl'] = sl
row['match_id'] = response.url[response.url.rfind('=')+1:]
row["Batsman"] = td[0].get_text()
row["R"] = td[1].get_text()
row["B"] = td[2].get_text()
row["4s"] = td[3].get_text()
row["6s"] = td[4].get_text()
row["SR"] = td[5].get_text()
item['Batsman_op1'].append(row)
elif i == 2:
sl = 0
for tr in html_text.find_all('tr'):
td = tr.find_all('td')
if td:
sl += 1
row = dict()
row['sl'] = sl
row['match_id'] = response.url[response.url.rfind('=')+1:]
row["Bowler"] = td[0].get_text()
row["O"] = td[1].get_text()
row["M"] = td[2].get_text()
row["R"] = td[3].get_text()
row["W"] = td[4].get_text()
row["Econ"] = td[5].get_text()
item['Bowler_op1'].append(row)
else:
for tr in html_text.find_all('tr'):
td = tr.find_all('td')
if i == 0:
try:
row_for_other["InningsMatchDetails"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/text()[1]').extract()[0]
except:
row_for_other["InningsMatchDetails"] = None
try:
row_for_other["CurrentScore"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/span/text()').extract()[0]
except:
row_for_other["CurrentScore"] = None
try:
row_for_other["OversRunRate"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/text()[2]').extract()[0]
except:
row_for_other["OversRunRate"] = None
try:
row_for_other["Extras"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/table[1]/'
'tr/td/b/text()[3]').extract()[0]
except:
row_for_other["Extras"] = None
try:
row_for_other["MatchResult"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/text()[4]').extract()[0]
except:
row_for_other["MatchResult"] = None
try:
row_for_other["RecentOvers"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[4]/tr/td[2]/text()').extract()[0]
except:
row_for_other["RecentOvers"] = None
try:
row_for_other["LastBatsman"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[6]/tr/td/text()').extract()[0]
except:
row_for_other["LastBatsman"] = None
row_for_other['match_id'] = response.url[response.url.rfind('=')+1:]
item['other_op1'].append(row_for_other)
return item
Your XPath seems to miss some tags. On the web page there are two div levels before the second table; replacing / with // takes care of these. (Because my browser added some <tbody> tags, there is also a double slash in front of the tr.)
.//*[@id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()
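In the spider, the LastBatsman extraction would then look something like this sketch (untested against the live page):
try:
    # // lets the intermediate divs (and any browser-inserted tbody) match,
    # and a[1]/text() reads the link text inside the td.
    row_for_other["LastBatsman"] = sel.xpath(
        '//*[@id="ctl00_ContentPlaceHolder1_divData"]'
        '//table[6]//tr/td/a[1]/text()').extract()[0]
except:
    row_for_other["LastBatsman"] = None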
I'm making a simple crawling program with Python and MySQL. But when I run this simple program, an error occurs, and the contents crawled from the web aren't updated in the MySQL table. The error message shows a ProgrammingError with a syntax error, but I don't think I typed the code wrong, because there are HTML tags at the point the error complains about. Why do HTML tags appear in the error message? I think there is some problem between MySQL and Python. Here is the error message.
Traceback (most recent call last):
File "crawl.py", line 237, in <module>
parseArticle( u )
File "crawl.py", line 166, in parseArticle
db.updateURL( url , contents )
File "crawl.py", line 206, in updateURL
self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))
File "/usr/lib/python2.7/dist-packages/MySQLdb/cursors.py", line 174, in execute
self.errorhandler(self, exc, value)
File "/usr/lib/python2.7/dist-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
ProgrammingError: (1064, 'You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near \'\xeb\x8f\x8b\xec\x9b\x80\', dotum, sans-serif; }\r\n\t//-->\r\n\t</style>\n<p style="TEXT-ALIGN: center\' at line 1')
And here is the source code. Thank you for your help.
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import robotparser
import urllib2
import time, traceback, re, sys, os
import MySQLdb
crawler_name = 'daum_blog_crawler'
mainpage = 'http://blog.daum.net/'
rp = robotparser.RobotFileParser( mainpage + 'robots.txt' )
rp.read()
def canFetch( url ):
return rp.can_fetch( crawler_name, url )
def getContent( url, delay=1):
time.sleep( delay )
if not canFetch( url ):
print 'This url can NOT be fetched by our crawler :', url
return None
try:
opener = urllib2.build_opener()
opener.addheaders = [('User-agent',crawler_name)]
contents = opener.open(url).read()
except:
traceback.print_exc()
return None
return contents
def getArticleInfo( soup ):
rBlog = re.compile('.+blog.daum.net/\w+/\d+.*?')
URLs = soup('a',{'href':rBlog})
return [ u.get('href').split('?')[0] for u in URLs ]
def getOwnArticles( contents ):
ret = []
soup = BeautifulSoup( contents )
rBlog = re.compile('.+/BlogTypeView.+')
for u in soup('a',{'href':rBlog}):
href = u.get('href')
article = href.split('articleno=')[1].split('&')[0]
if ret.count(article)<1:
ret.append( article )
return ret
def gatherNeighborInfo( soup ):
rBlog = re.compile('http://blog.daum.net/\w+')
Neighbors = soup('a',{'href':rBlog})
cnt = 0
for n in Neighbors:
url = n.get('href')
blogname = url.split('/')[-1]
if url and url.startswith('http://') and db.isCrawledURL(url)<1:
db.insertURL( url, 1 )
url2 = getRedirectedURL( url )
if not url2: continue
re_url = 'http://blog.daum.net' + url2
body = getContent( re_url, 0 )
if body:
for u in getOwnArticles( body ):
fullpath = 'http://blog.daum.net/'+blogname+'/'+u
cnt+=db.insertURL( fullpath )
if cnt>0: print '%d neighbor articles inserted'%cnt
def getRedirectedURL( url ):
contents = getContent( url )
if not contents: return None
#redirect
try:
soup = BeautifulSoup( contents )
frame = soup('frame')
src = frame[0].get('src')
except:
src = None
return src
def getBody( soup, parent ):
rSrc = re.compile('.+/ArticleContentsView.+')
iframe = soup('iframe',{'src':rSrc})
if len(iframe)>0:
src = iframe[0].get('src')
iframe_src = 'http://blog.daum.net'+src
req = urllib2.Request( iframe_src )
req.add_header('Referer', parent )
body = urllib2.urlopen(req).read()
soup = BeautifulSoup( body )
strbody= str(soup.body)
return strbody
else:
print 'NULL contents'
return ''
def parseArticle( url ):
article_id = url.split('/')[-1]
blog_id = url.split('/')[-2]
#for debugging, temp
if blog_id.isdigit():
print 'digit:', url.split('/')
newURL = getRedirectedURL( url )
if newURL:
newURL = 'http://blog.daum.net'+newURL
print 'redirecting', newURL
contents = getContent( newURL, 0 )
if not contents:
print 'Null Contents...'
db.updateURL( url, -1 )
return
soup = BeautifulSoup( contents )
gatherNeighborInfo( soup )
n=0
for u in getArticleInfo( soup ):
n+=db.insertURL( u )
if n>0: print 'inserted %d urls from %s'%(n,url)
sp = contents.find('<title>')
if sp>-1:
ep = contents[sp+7:].find('</title>')
title = contents[sp+7:sp+ep+7]
else:
title = ''
contents = getBody( soup, newURL )
db.updateURL( url , contents )
else:
print 'Invalid blog article...'
db.updateURL( url, 'None', -1 )
class DB:
"MySQL wrapper class"
def __init__(self):
self.conn = MySQLdb.connect(db='crawlDB', user='root', passwd='qltkd')
self.conn.query("set character_set_connection=utf8;")
self.conn.query("set character_set_server=utf8;")
self.conn.query("set character_set_client=utf8;")
self.conn.query("set character_set_results=utf8;")
self.conn.query("set character_set_database=utf8;")
self.cursor = self.conn.cursor()
self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url CHAR(150), state INT, content TEXT)')
def commit(self):
self.conn.commit()
def __del__(self):
self.conn.commit()
self.cursor.close()
def insertURL(self, url, state=0, content=None):
#'/' delete
if url[-1]=='/': url=url[:-1]
try:
self.cursor.execute("INSERT INTO urls VALUES ('%s',%d,'%s')"%(url,state,content))
except:
return 0
else:
return 1
def selectUncrawledURL(self):
self.cursor.execute("SELECT * FROM urls where state=0")
return [ row[0] for row in self.cursor.fetchall() ]
def updateURL(self, url, content, state=1):
if url[-1]=='/': url=url[:-1]
self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))
def isCrawledURL(self, url):
if url[-1]=='/': url=url[:-1]
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s' AND state=1"%url)
ret = self.cursor.fetchone()
return ret[0]
db = DB()
if __name__=='__main__':
print 'starting crawl.py...'
contents = getContent( mainpage )
URLs = getArticleInfo( BeautifulSoup( contents ) )
nSuccess = 0
for u in URLs:
nSuccess += db.insertURL( u )
print 'inserted %d new pages.'%nSuccess
while 1:
uncrawled_urls = db.selectUncrawledURL()
if not uncrawled_urls: break
for u in uncrawled_urls:
print 'downloading %s'%u
try:
parseArticle( u )
except:
traceback.print_exc()
db.updateURL( u, -1 )
db.commit()
#bs.UpdateIndex()
You can try:
self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,self.conn.escape_string(content),url))
I am new to Python, and I am developing a web crawler. Below is the program, which gets the links from a given url, but the problem is that I don't want it to visit a url that has already been visited. Please help me.
import re
import urllib.request
import sqlite3
db = sqlite3.connect('test2.db')
db.row_factory = sqlite3.Row
db.execute('drop table if exists test')
db.execute('create table test(id INTEGER PRIMARY KEY,url text)')
#linksList = []
#module to visit the given url and get all the links on that page
def get_links(urlparse):
try:
if urlparse.find('.msi') ==-1: #check whether the url contains .msi extensions
htmlSource = urllib.request.urlopen(urlparse).read().decode("iso-8859-1")
#parsing htmlSource and finding all anchor tags
linksList = re.findall('<a href=(.*?)>.*?</a>',htmlSource) #returns href and other attributes of a tag
for link in linksList:
start_quote = link.find('"') # setting start point in the link
end_quote = link.find('"', start_quote + 1) #setting end point in the link
url = link[start_quote + 1:end_quote] # get the string between start_quote and end_quote
def concate(url): #since few href may return only /contact or /about so concatenating its baseurl
if url.find('http://'):
url = (urlparse) + url
return url
else:
return url
url_after_concate = concate(url)
# linksList.append(url_after_concate)
try:
if url_after_concate.find('.tar.bz') == -1: # skip links that point to software or download pages
db.execute('insert or ignore into test(url) values (?)', [url_after_concate])
except:
print("insertion failed")
else:
return True
except:
print("failed")
get_links('http://www.python.org')
cursor = db.execute('select * from test')
for row in cursor: # retrieve the links stored in database
print (row['id'],row['url'])
urlparse = row['url']
# print(linksList)
# if urlparse in linksList == -1:
try:
get_links(urlparse) # again parse the link from database
except:
print ("url error")
Please suggest a way to solve this problem.
You should keep a list of 'visited' pages. When you come to request the next url, you can check whether the list already contains that url and, if so, skip it. I'm not a Python programmer, so here's some pseudo-code:
Create listOfVisitedUrls
...
Start Loop
Get nextUrl
If nextUrl IsNotIn listOfVisitedUrls Then
Request nextUrl
Add nextUrl to listOfVisitedUrls
End If
Loop
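In Python, that pseudo-code might look like the following sketch, assuming a get_links(url) helper that returns an iterable of the links found on a page (like the one in the answer below):
visited = set()                          # urls that have already been requested
to_visit = ['http://www.python.org']     # seed url(s)

while to_visit:
    next_url = to_visit.pop()
    if next_url in visited:
        continue                         # already crawled, skip it
    visited.add(next_url)
    new_links = get_links(next_url)      # assumed to return an iterable of urls
    for link in new_links:
        if link not in visited:
            to_visit.append(link)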
You can use the following code:
import re
from urllib import urlopen
# Since few href may return only /contact or /about, concatenate to baseurl.
def concat(url, baseurl):
if url.find('http://'):
url = baseurl + url
return url
else:
return url
def get_links(baseurl):
resulting_urls = set()
try:
# Check whether the url contains .msi extensions.
if baseurl.find('.msi') == -1:
# Parse htmlSource and find all anchor tags.
htmlSource = urlopen(baseurl).read()
htmlSource = htmlSource.decode("iso-8859-1")
# Returns href and other attributes of a tag.
linksList = re.findall('<a href=(.*?)>.*?</a>',htmlSource)
for link in linksList:
# Setting start and end points in the link.
start_quote = link.find('"')
end_quote = link.find('"', start_quote + 1)
# Get the string between start_quote and end_quote.
url = link[start_quote + 1:end_quote]
url_after_concat = concat(url, baseurl)
resulting_urls.add(url_after_concat)
else:
return True
except:
print("failed")
return resulting_urls
get_links('http://www.python.org')
It will return a set() containing the unique URLs found at your baseurl; for http://www.python.org, you should get:
set([u'http://www.python.org/download/',
u'http://docs.python.org/',
u'http://www.python.org#left-hand-navigation',
u'http://wiki.python.org/moin/PyQt',
u'http://wiki.python.org/moin/DatabaseProgramming/',
u'http://roundup.sourceforge.net/',
u'http://www.python.org/ftp/python/3.2.3/Python-3.2.3.tar.bz2',
u'http://www.python.org/about/website',
u'http://www.python.org/about/quotes',
u'http://www.python.org/community/jobs/',
u'http://www.python.org/psf/donations/',
u'http://www.python.org/about/help/',
u'http://wiki.python.org/moin/CgiScripts',
u'http://www.zope.org/',
u'http://www.pygame.org/news.html',
u'http://pypi.python.org/pypi',
u'http://wiki.python.org/moin/Python2orPython3',
u'http://www.python.org/download/releases/2.7.3/',
u'http://www.python.org/ftp/python/3.2.3/python-3.2.3.msi',
u'http://www.python.org/community/',
u'http://www.python.org/ftp/python/2.7.3/Python-2.7.3.tar.bz2',
u'http://wiki.python.org/moin/WebProgramming',
u'http://www.openbookproject.net/pybiblio/',
u'http://twistedmatrix.com/trac/',
u'http://wiki.python.org/moin/IntegratedDevelopmentEnvironments',
u'http://www.pentangle.net/python/handbook/',
u'http://wiki.python.org/moin/TkInter',
u'http://www.vrplumber.com/py3d.py',
u'http://sourceforge.net/projects/mysql-python',
u'http://wiki.python.org/moin/GuiProgramming',
u'http://www.python.org/about/',
u'http://www.edgewall.com/trac/',
u'http://osl.iu.edu/~lums/swc/',
u'http://www.python.org/community/merchandise/',
u"http://www.python.org'/psf/",
u'http://wiki.python.org/moin/WxPython',
u'http://docs.python.org/3.2/',
u'http://www.python.org#content-body',
u'http://www.python.org/getit/',
u'http://www.python.org/news/',
u'http://www.python.org/search',
u'http://www.python.org/community/sigs/current/edu-sig',
u'http://www.python.org/about/legal',
u'http://www.timparkin.co.uk/',
u'http://www.python.org/about/apps',
u'http://www.turbogears.org/',
u'http://www.egenix.com/files/python/mxODBC.html',
u'http://docs.python.org/devguide/',
u'http://docs.python.org/howto/sockets.html',
u'http://www.djangoproject.com/',
u'http://buildbot.net/trac',
u'http://www.python.org/psf/',
u'http://www.python.org/doc/',
u'http://wiki.python.org/moin/Languages',
u'http://www.xs4all.com/',
u'http://www.python.org/',
u'http://wiki.python.org/moin/NumericAndScientific',
u'http://www.python.org/channews.rdf',
u'http://www.alobbs.com/pykyra',
u'http://wiki.python.org/moin/PythonXml',
u'http://wiki.python.org/moin/PyGtk',
u'http://www.python.org/ftp/python/2.7.3/python-2.7.3.msi',
u'http://www.python.org/download/releases/3.2.3/',
u'http://www.python.org/3kpoll'])
Hope that helps.