My parser load only first href for list, but I need to load all href for list. This is my code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
from lxml import html
import urllib.parse
import urllib.request
class VashMagaz(object):
RESULT = []
def parse_vashmagaz_run(self):
url = 'https://vashmagazin.ua/nerukhomist/kvartyry/'
r = requests.get(url)
res = html.fromstring(r.content)
result = res.xpath(u'//*[contains(text(), "120")]/#href')
num = self._get_page_num(result[0])
result = self.get_page_data(num)
return result
def get_page_data(self, num):
url = 'https://vashmagazin.ua/nerukhomist/kvartyry/?item_price1=&item_price2=&page={}'
for i in range(1, num):
r = requests.get(url.format(i))
self.get_all(r.content)
return self.RESULT
def _get_page_num(self,href):
result = urllib.parse.urlparse(href)
result = urllib.parse.parse_qs(result.query)
return int(result['page'][0])
def get_all(self, data):
data = self._get_desc(data)
for key, i in enumerate(data):
text = i.xpath('.//h3[#class="ner_h3"]/a/text()')[key]
href = i.xpath('.//h3[#class="ner_h3"]/a/#href')[key]
self.RESULT.append({'text': text,
'href': 'https://vashmagazin.ua/' + href,
})
def _get_desc(self, data):
return self.get_from_xpath(data, '//*[#id="price"]')
def get_from_xpath(self, data, xpath):
res = html.fromstring(data)
return res.xpath(xpath)
if __name__ == '__main__':
magaz = VashMagaz()
magaz.parse_vashmagaz_run()
msg = u'Subject: Квартири'+"\n"
for res in magaz.RESULT:
for k, i in res.items():
msg+=str(res[k]).strip()+"\n"
msg+='-------------------------------'+'\n'
print(msg)
Related
I have a function that scrapes href link form a particular page and returns the result. I want to call this function in a parallel way to save time. I have visited this problem Running same function for multiple files in parallel in python
But the challenge is that I need to save the return element in a list. How can I do that? Here is my code snippet.
url = "https://www.programmableweb.com/category/all/apis"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data,'html.parser')
#function to scrape individual pages
def scrap_api_url(i):
print(i)
page_url = "https://www.programmableweb.com" + mid_url + '=' + str(i)
response = requests.get(page_url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
all_api = soup.find_all('tr', class_ = re.compile('^(even|odd)$'))
return all_api
url_tag = soup.find('a',{'title' : 'Go to next page'})
mid_url = url_tag.get('href').split('=')[0]
threads=[]
#calling functions
if __name__ == '__main__':
inputs = [i for i in range(851)]
for item in inputs:
print('Thread Started :: ', item)
t = threading.Thread(target = scrap_api_url, args=(item,))
threads.append(t)
t.start()
h = []
for t in threads:
h.append(t.join())
You can use the ThreadPoolExecutor map method:
import re
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
def main():
url = "https://www.programmableweb.com/category/all/apis"
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data,'html.parser')
url_tag = soup.find('a',{'title' : 'Go to next page'})
mid_url = url_tag.get('href').split('=')[0]
# function to scrape individual pages
def scrap_api_url(i):
print(i)
page_url = "https://www.programmableweb.com" + mid_url + '=' + str(i)
response = requests.get(page_url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
all_api = soup.find_all('tr', class_=re.compile('^(even|odd)$'))
return all_api
inputs = [i for i in range(851)]
with ThreadPoolExecutor() as executor:
future_results = executor.map(scrap_api_url, inputs)
results = [result for result in future_results]
print(results)
#calling functions
if __name__ == '__main__':
main()
I've found the working solution to write python scrapy code for extracting url's present in sitemap of a site from here but don't know how to export the data to CSV file!
When I try to run scrapy crawl myspider -o mydata.csv it returns an empty csv file, but list of urls are getting printed on screen!
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests
class GetpagesfromsitemapSpider(SitemapSpider):
name = "myspider"
handle_httpstatus_list = [404]
def parse(self, response):
print(response.url)
def _parse_sitemap(self, response):
if response.url.endswith('/robots.txt'):
for url in sitemap_urls_from_robots(response.body):
yield Request(url, callback=self._parse_sitemap)
else:
body = self._get_sitemap_body(response)
if body is None:
self.logger.info('Ignoring invalid sitemap: %s', response.url)
return
s = Sitemap(body)
sites = []
if s.type == 'sitemapindex':
for loc in iterloc(s, self.sitemap_alternate_links):
if any(x.search(loc) for x in self._follow):
yield Request(loc, callback=self._parse_sitemap)
elif s.type == 'urlset':
for loc in iterloc(s):
for r, c in self._cbs:
if r.search(loc):
sites.append(loc)
break
print(sites)
def __init__(self, spider=None, *a, **kw):
super(GetpagesfromsitemapSpider, self).__init__(*a, **kw)
self.spider = spider
l = []
url = "http://www.example.com/"
resp = requests.head(url + "/sitemap.xml")
if (resp.status_code != 404):
l.append(resp.url)
else:
resp = requests.head(url + "/robots.txt")
if (resp.status_code == 200):
l.append(resp.url)
self.sitemap_urls = l
print(self.sitemap_urls)
def iterloc(it, alt=False):
for d in it:
yield d['loc']
# Also consider alternate URLs (xhtml:link rel="alternate")
if alt and 'alternate' in d:
for l in d['alternate']:
yield l
First, you aren't make any request with scrapy, also you're combining scrapy with requests, that i think it's not the best idea. Try to change __init__ to:
def start_requests(self):
l = []
url = "http://www.example.com"
l.append(url + '/sitemap.xml')
l.append(url + '/robots.txt')
for link in l:
yield Request(link, callback=self._parse_sitemap)
Also, your self._parse_sitemap SHOULD return dict-like or Request(not only your self._parse_sitemap, every function in your scrapy spider, see docs):
def _parse_sitemap(self, response):
# handle here status responses(200,401,etc)
body = self._get_sitemap_body(response)
if body is None:
self.logger.info('Ignoring invalid sitemap: %s', response.url)
return
s = Sitemap(body)
sites = {} # You should return a dict-like item!
if s.type == 'sitemapindex':
for loc in iterloc(s, self.sitemap_alternate_links):
if any(x.search(loc) for x in self._follow):
yield Request(loc, callback=self._parse_sitemap)
elif s.type == 'urlset':
for loc in iterloc(s):
for r, c in self._cbs:
if r.search(loc):
sites.append(loc)
break
yield sites # Change print to yield!, this is the way to populate your .csv file
The whole file(probably doesn't work, but explains the idea):
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests
class GetpagesfromsitemapSpider(SitemapSpider):
name = "myspider"
handle_httpstatus_list = [404]
def parse(self, response):
print(response.url)
def _parse_sitemap(self, response):
# handle here status responses(200,401,etc)
body = self._get_sitemap_body(response)
if body is None:
self.logger.info('Ignoring invalid sitemap: %s', response.url)
return
s = Sitemap(body)
sites = {} # You should return a dict-like item!
if s.type == 'sitemapindex':
for loc in iterloc(s, self.sitemap_alternate_links):
if any(x.search(loc) for x in self._follow):
yield Request(loc, callback=self._parse_sitemap)
elif s.type == 'urlset':
for loc in iterloc(s):
for r, c in self._cbs:
if r.search(loc):
sites.append(loc)
break
yield sites # Change print to yield!, this is the way to populate your .csv file
def start_requests(self):
l = []
url = "http://www.example.com"
l.append(url + '/sitemap.xml')
l.append(url + '/robots.txt')
for link in l:
yield Request(link, callback=self._parse_sitemap)
def iterloc(it, alt=False):
for d in it:
yield d['loc']
# Also consider alternate URLs (xhtml:link rel="alternate")
if alt and 'alternate' in d:
for l in d['alternate']:
yield l
I'm trying to get some datas from a javascript webpage. My code is generating multiple links and parsing them one by one. Parsing outputs are lists. I have written this code with help from here. But it produces the lists inside a class. I want to insert list items into an sqlite table, and because of this I want to make the local list items global. I've tried to create a global list, put it into the class, and then append to it and return it. I've tried to directly insert them into the database from the processCurrentPage method and tried to create a list under the class and reach it by Webpage.list. But none of these methods worked. One of my attempts is here, but not the best one - it's only an example. I've tried many alternatives like this. Can you suggest a good way to handle it please?
P.S: I am new at Python, but researching it for whole two days, and read all class documentation, but couldn't find a way.
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
import requests
from bs4 import BeautifulSoup
import bs4 as bs
class WebPage(QtWebEngineWidgets.QWebEnginePage):
alldatas=[]
def __init__(self):
super(WebPage, self).__init__()
self.loadFinished.connect(self.handleLoadFinished)
def start(self, urls):
self._urls = iter(urls)
self.fetchNext
#property
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QtCore.QUrl(url))
return True
def processCurrentPage(self, html):
url = self.url().toString()
# do stuff with html...
soup = bs.BeautifulSoup(html, 'html.parser')
data = soup.find('div', class_='tablo_dual_board')
data1 = data.text
data2 = data1.splitlines()
self.alldatas+=data2
if not self.fetchNext:
QtWidgets.qApp.quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
def javaScriptConsoleMessage(self, QWebEnginePage_JavaScriptConsoleMessageLevel, p_str, p_int, p_str_1):
# disable javascript error output
pass
if __name__ == '__main__':
# generate some test urls
onexurl = "https://1xbahis1.com/en/live/Football/"
r = requests.get(onexurl)
soup = BeautifulSoup(r.content, "html.parser")
income = soup.find_all("ul", {"id":"games_content"})
links = soup.find_all("a", {"class": "c-events__name"})
urls = []
for matchlink in links:
urls.append("https://1xbahis1.com/en/"+(matchlink.get("href")))
app = QtWidgets.QApplication(sys.argv)
webpage = WebPage()
webpage.start(urls)
print(webpage.alldatas)
sys.exit(app.exec_())
Below is a version of your script that should do what you want. The scrape_page function is called for each url that is processed, and the data is added to a global records list. The process_records function is called once after all the pages have been scraped. You can use this function to add the records to your database.
import sys
import requests
from bs4 import BeautifulSoup
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
records = []
def scrape_page(url, html):
print('scrape page:', url)
soup = BeautifulSoup(html, 'html.parser')
data = soup.find('div', class_='tablo_dual_board')
if data is not None:
records.append(data.text.splitlines())
else:
print('error: could not find tablo_dual_board')
def process_records():
# add record to database ...
print('process records:', len(records))
def generate_urls():
onexurl = "https://1xbahis1.com/en/live/Football/"
reply = requests.get(onexurl)
soup = BeautifulSoup(reply.content, "html.parser")
income = soup.find_all("ul", {"id":"games_content"})
links = soup.find_all("a", {"class": "c-events__name"})
urls = []
for matchlink in links:
urls.append("https://1xbahis1.com/en/"+(matchlink.get("href")))
return urls
class WebPage(QtWebEngineWidgets.QWebEnginePage):
def __init__(self):
super(WebPage, self).__init__()
self.loadFinished.connect(self.handleLoadFinished)
def start(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QtCore.QUrl(url))
return True
def processCurrentPage(self, html):
scrape_page(self.url().toString(), html)
if not self.fetchNext():
process_records()
QtWidgets.qApp.quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
def javaScriptConsoleMessage(self, QWebEnginePage_JavaScriptConsoleMessageLevel, p_str, p_int, p_str_1):
# disable javascript error output
pass
if __name__ == '__main__':
app = QtWidgets.QApplication(sys.argv)
webpage = WebPage()
webpage.start(generate_urls())
sys.exit(app.exec_())
I have the following code which scrapes a website for divs with the class "odd" or "even". I'd like to make "odd" and "even" an argument my function takes in, which would allow me to add other divs as well. Here is my code:
#
# Imports
#
import urllib2
from bs4 import BeautifulSoup
import re
import os
from pprint import pprint
#
# library
#
def get_soup(url):
page = urllib2.urlopen(url)
contents = page.read()
soup = BeautifulSoup(contents, "html.parser")
body = soup.findAll("tr", ["even", "odd"])
string_list = str([i for i in body])
return string_list
def save_to_file(path, soup):
with open(path, 'w') as fhandle:
fhandle.write(soup)
#
# script
#
def main():
url = r'URL GOES HERE'
path = os.path.join('PATH GOES HERE')
the_soup = get_soup(url)
save_to_file(path, the_soup)
if __name__ == '__main__':
main()
I'd like to incorporate *args into the code so the get_soup function would look like this:
def get_soup(url, *args):
page = urllib2.urlopen(url)
contents = page.read()
soup = BeautifulSoup(contents, "html.parser")
body = soup.findAll("tr", [args])
string_list = str([i for i in body])
return string_list
def main():
url = r'URL GOES HERE'
path = os.path.join('PATH GOES HERE')
the_soup = get_soup(url, "odd", "even")
save_to_file(path, the_soup)
Unfortunately, this isn't working. Ideas?
Don't put args in a list, args is already a tuple so just pass that:
body = soup.findAll("tr", args)
If you [args], you would end up with something like [("odd","even")].
Also str([i for i in body]) makes no real sense, it would be the same as just doing str(body) but I don't see how that format could be useful.
I made simple crawler using python and sqlite3. but there are some errors in the cmd screen. so I have searched this kind of error from stackoverflow.com. but i can't find the solution. some Q&A suggested me that I have to use ? instead of % on the sqlite command like SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url. but it was not working.
here is the error.
Traceback (most recent call last):
File "C:\Python27\crawl.py", line 239, in (module)
parseArticle( u )
File "C:\Python27\crawl.py", line 146, in parseArticle
gaterNeighborInfo(soup)
File "C:\Python27\crawl.py", line 68, in gaterNeighborInfo
if url and url.startswith('http://') and db.isCrawledURL(url)<1:
File "C:\Python27\crawl.py", line 217, in isCrawledURL
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url)
OperationalError: near "state": syntax error
As you see, this error seems to be hierarchical. but I don't know what is wrong and where this error starts.
here is the source code.
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import robotparser
import urllib2
import time, traceback, re, sys, os
import sqlite3
crawler_name = 'python_daum_crawler'
mainpage = 'http://blog.daum.net/'
mainpath = './data/'
# robot parser를 설정합니다.
rp = robotparser.RobotFileParser(mainpage + 'robot.txt')
rp.read()
def canFetch(url):
"수집 가능 여부를 체크합니다."
return rp.can_fetch(crawler_name, url)
def getContent(url, delay=1):
"웹문서를 다운로드 합니다."
time.sleep(delay)
if not canFetch(url):
# 웹마스터가 수집을 원치 않는 페이지는 수집을 하지 않습니다.
print('This url can NOT be fetched by our crawler :', url)
return None
try:
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', crawler_name)]
contents = opener.open(url).read()
except:
traceback.print_exc()
return None
return contents
def getArticleInfo(soup):
"daum blog 내의 article info를 얻어 옵니다."
rBlog = re.compile('.+blog.daum.net/|w+/|d+.*?')
URLs = soup('a',{'href':rBlog})
return [ u.get('href').split('?')[0] for u in URLs ]
def getOwnArticles(contents):
"해당 블로그에 포함되는 글의 목록을 가져옵니다."
ret = []
soup = BeautifulSoup(contents)
rBlog = re.compile('.+/BlogView.+')
for u in soup('a', {'href':rBlog}):
href = u.get('href')
article = href.split('articleno=')[1].split('&')[0]
if ret.count(article)<1:
ret.append(article)
return ret
def gatherNeighborInfo(soup):
"이웃 블로거/혹은 다녀간 블로거 정보를 수집합니다."
#daum blog 관련 주소를 찾습니다.
rBlog = re.compile('http://blog.daum.net/|w+')
Neighbors = soup('a',{'href':rBlog})
cnt = 0
for n in Neighbors:
url = n.get('href')
blogname = url.split('/')[-1]
if url and url.startswith('http://') and db.isCrawledURL(url)<1:
db.insertURL( url, 1 )
url2 = getRedirectedURL(url)
if not url2: continue
re_url = 'http://blog.daum.net' + url2
body = getContent(re_url, 0)
if body:
for u in getOwnArticles(body):
#자신의 글 주소를 db에 저장합니다.
fullpath = 'http://blog.daum.net/'+blogname+'/'+u
cnt += db.insertURL(fullpath)
if cnt>0: print('%d neighbor articles inserted'%cnt)
def getRedirectedURL(url):
"본문에 해당하는 프레임의 url을 얻어옵니다."
contents = getContent(url)
if not contents: return None
#redirect
try:
soup = BeautifulSoup(contents)
frame = soup('frame')
src = frame[0].get('src')
except:
src = None
return src
def getBody(soup, parent):
"본문 텍스트를 구합니다."
#본문 주소를 포함한 iframe을 찾습니다.
rSrc = re.compile('.+/ArticleContentsView.+')
iframe = soup('iframe',{'src':rSrc})
if len(iframe)>0:
src = iframe[0].get('src')
iframe_src = 'http://blog.daum.net'+src
#그냥 request하면 안 되고, referer를 지정해야 browser를 통해 요청한 것으로 인식합니다.
req = urllib2.Request(iframe_src)
req.add_header('Refere', parent)
body = urllib2.urlopen(req).read()
soup = BeautifulSoup(body)
return str(soup.body)
else:
print('NULL contents')
return ''
def parseArticle(url):
"해당 url을 parsing하고 저장합니다."
#blog id와 article id를 얻습니다.
article_id = url.split('/')[-1]
blog_id = url.split('/')[-2]
#redirect된 주소를 얻어 옵니다.
newURL = getRedirectedURL(url)
if newURL:
try:
#blog 디렉터리를 만듭니다.
os.mkdir(mainpath+blog_id)
except:
#디렉터리를 만들다 에러가 난 경우 무시합니다.
pass
newURL = 'http://blog.daum.net'+newURL
contents = getContent(newURL, 0)
if not contents:
print('Null Contents...')
#해당 url이 유효하지 않은 경우 에러(-1)로 표시합니다.
db.updateURL(url, -1)
return
#HTML을 파싱합니다.
soup = BeautifulSoup(contents)
#이웃 블로거 정보가 있나 확인합니다.
gatherNeighborInfo(soup)
#블로그 URL이 있을 경우 db에 삽입합니다.
n=0
for u in getArticleInfo(soup):
n += db.insertURL(u)
if n>0: print('inserted %d urls from %s'%(n,url))
#title을 얻습니다.
sp = contents.find('<title>')
if sp>-1:
ep = contents[sp+7:].find('<title>')
title = contents[sp+7:sp+ep+7]
else:
title = ''
#본문 HTML을 보기 쉽게 정리합니다.
contents = getBody(soup, newURL)
#script를 제거합니다.
pStyle = re.compile('<style(.*?)>(.*?)</style>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
pStyle = re.compile('<script(.*?)>(.*?)</script>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
pStyle = re.compile("<(.*?)>", re.IGNORECASE | re.MULTILINE | re.DOTALL )
contents = pStyle.sub('', contents)
#txt file을 저장합니다.
fTXT = open( mainpath + blog_id + '/' + article_id + '.txt', 'w')
fTXT.write( title+'|n')
fTXT.write(contents)
fTXT.close()
#처리했다고 db에 표시합니다.
db.updateURL(url)
else:
print('Invalid blog article...')
#해당 url이 유효하지 않은 경우 에러(-1)로 표시합니다.
db.updateURL(url, -1)
class DB:
"SQLITE3 wrapper class"
def __init__(self):
self.conn = sqlite3.connect('crawlerDB')
self.cursor = self.conn.cursor()
self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url text, state int)')
self.cursor.execute('CREATE UNIQUE INDEX IF NOT EXISTS IDX001 ON urls(url)')
self.cursor.execute('CREATE INDEX IF NOT EXISTS IDX002 ON urls(state)')
def __del__(self):
self.conn.commit()
self.cursor.close()
def insertURL(self, url, state=0):
try:
self.cursor.execute("INSERT INTO urls VALUES ('%s',%d)"%(url,state))
self.conn.commit()
except:
return 0
else:
return 1
def selectUncrawledURL(self):
self.cursor.execute('SELECT * FROM urls where state=0')
return [ row[0] for row in self.cursor.fetchall() ]
def updateURL(self, url, state=1):
self.cursor.execute("UPDATE urls SET state=%d WHERE url='%s'"%(state,url))
def isCrawledURL(self, url):
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s'state=1"%url)
ret = self.cursor.fetchone()
return ret[0]
db = DB()
if __name__=='__main__':
print('starting crawl.py...')
#메인 페이지를 체크합니다.
contents = getContent(mainpage)
URLs = getArticleInfo( BeautifulSoup( contents ) )
nSuccess = 0
for u in URLs:
nSuccess += db.insertURL(u)
print('inserted %d new pages.'%nSuccess)
while 1:
for u in db.selectUncrawledURL():
#아직 읽지 않은 url을 얻어서 처리합니다.
print('downloading %s'%u)
try:
parseArticle( u )
except:
traceback.print_exc()
db.updateURL( u, -1 )
You are generating incorrect SQL; you probably want a url=... AND state=1 (with a space and AND to match both criteria.
Also, you should not use string interpolation, use SQL parameters instead:
def isCrawledURL(self, url):
self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url=? AND state=1", (url,))
ret = self.cursor.fetchone()
return ret[0]
This applies to all your queries, like:
self.cursor.execute("INSERT INTO urls VALUES (?, ?)", (url,state))
and:
self.cursor.execute("UPDATE urls SET state=? WHERE url=?", (state,url))
Note that the parameters are passed into the cursor.execute() calls as a second argument (a sequence of values).
You are missing a whitespace and an AND keyword before state in your query.