Scrapy Callback Function not scraping the Entire Data? - python

First of all this is my code-:
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess, CrawlerRunner
import scrapy
#from scrapy import log, signals
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
import datetime
from multiprocessing import Process, Queue
import os
from scrapy.http import Request
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
import re
#query=raw_input("Enter a product to search for= ")
query='apple'
query1=query.replace(" ", "+")
class DmozItem(scrapy.Item):
productname = scrapy.Field()
product_link = scrapy.Field()
current_price = scrapy.Field()
mrp = scrapy.Field()
offer = scrapy.Field()
imageurl = scrapy.Field()
outofstock_status = scrapy.Field()
add = scrapy.Field()
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["http://www.bestmercato.com"]
def start_requests(self):
task_urls = [
]
i=1
for i in range(1,2):
temp=("https://www.bestmercato.com/index.php?route=product/search&search="+query1+"&page="+str(i))
task_urls.append(temp)
i=i+1
start_urls = (task_urls)
# p=len(task_urls)
return [ Request(url = start_url) for start_url in start_urls ]
def parse(self, response):
items = []
for sel in response.xpath('//html/body/div/div/div[4]/div/div/div[5]/div'):
item = DmozItem()
item['productname'] = str(sel.xpath('div[#class="product-thumb"]/div[#class="small_detail"]/div[#class="name"]/a/text()').extract())[3:-2]
item['product_link'] = str(sel.xpath('div[#class="product-thumb"]/div[#class="small_detail"]/div[#class="name"]/a/#href').extract())[3:-2]
point1 = sel.xpath('div[#class="product-thumb"]/div[#class="small_detail"]/div[4]').extract()
point = str(sel.xpath('div[#class="product-thumb"]/div[#class="small_detail"]/div[4]/#class').extract())[3:-2]
checker = "options" in point
item['current_price'] = ""
if checker:
i=1
p=1
while i==1:
t = str(sel.xpath('div[#class="product-thumb"]/div[#class="small_detail"]/div[4]/div/select/option['+str(p)+']/text()').extract())[3:-2]
#print t
if 'Rs' not in t:
i = 2
elif 'Rs' in t:
i = 1
t= " ".join(t)
s = t.translate(None, '\ t')[:-2]
item['current_price'] = item['current_price'] + ' ; ' + s
p = p+1
item['current_price'] = item['current_price'][3:-3]
else:
item['current_price'] = 'Rs. ' + str(sel.xpath('div[#class="product-thumb"]/div[#class="small_detail"]/div[not (#class="name") or not(#class="description") or not(#class="qty") or not(#class="box_btn_icon")]/text()').extract())[46:-169]
re.findall(r"[-+]?\d*\.\d+|\d+", item["current_price"])
try:
test1 = str(sel.xpath('div/div[2]/div[3]/span[1]/text()').extract())[3:-2]
_digits = re.compile('\d')
if bool(_digits.search(test1)):
print 'hi'
test1=test1[:2]+'. '+test1[3:]
item['mrp'] = test1
#item['mrp'][2:2]='.'
test2 = str(sel.xpath('div/div[2]/div[3]/span[2]/text()').extract())[3:-2]
test2=test2[:2]+'. '+test2[3:]
item['current_price']=test2
else:
item['mrp'] = item['current_price']
except:
item['mrp'] = item['current_price']
item['offer'] = 'No additional offer available'
item['imageurl'] = str(sel.xpath('div[#class="product-thumb"]/div[#class="image"]/a[not (#class="sft_quickshop_icon")]/img[#class="img-responsive"]/#src').extract())[3:-2]
item['outofstock_status'] = str('In Stock')
request = Request(str(item['product_link']),callback=self.parse2, dont_filter=True)
request.meta['item'] = item
# print item
items.append(item)
return request
print (items)
def parse2(self, response):
item = response.meta['item']
item['add'] = response.url
return item
spider1 = DmozSpider()
settings = Settings()
settings.set("PROJECT", "dmoz")
settings.set("CONCURRENT_REQUESTS" , 100)
#)
#settings.set( "DEPTH_PRIORITY" , 1)
#settings.set("SCHEDULER_DISK_QUEUE" , "scrapy.squeues.PickleFifoDiskQueue")
#settings.set( "SCHEDULER_MEMORY_QUEUE" , "scrapy.squeues.FifoMemoryQueue")
crawler = CrawlerProcess(settings)
crawler.crawl(spider1)
crawler.start()
Now, these are the issues that I am facing.
1. There are numerous divs that can be found with this xpath - '//html/body/div/div/div[4]/div/div/div[5]/div' . However, the above code scrapes the contents only of the first div , i.e , having the xpath 'html/body/div/div/div[4]/div/div/div[5]/div[1]' , and not all of them.
The moment I comment these three lines, the scraper scrapes everything, obviously then I am not able to add the 'add' field in the item-:
request = Request(str(item['product_link']),callback=self.parse2, dont_filter=True)
request.meta['item'] = item
return request
So, I want to scrape all the divs , in addition with the 'add' field in my item Class (notice the class DmozItem). How do I do that? Please give a corrected code for my SPECIFIC case, it would be best that way!
2. Secondly, as I said, as I comment the three lines, that I mentioned above, then the program scrapes everything in a time close to 5 seconds (around 4.9 seconds).
But as soon as I un-comment, those 3 lines (again those that I mentioned above), the program's run-time exceeds drastically, and it runs in a time close to 9 seconds (around 8.8 - 8.9 seconds). Why does this happen? Is that because of this - dont_filter=True? Please suggest ways to overcome this, as the run-time can prove to be a very big problem for me. Also, can I decrease the initial time of 5 seconds (around 4.9) somehow?

Use html/body/div/div/div[4]/div/div/div[5]//div to get all divs after div[5].
EDIT:
This is the correct xpath - //html/body/div/div/div[4]/div/div/div[5]/div, that gave all the div's after div[5]. The previous one mentioned, gave multiple errors!
If you do a return statement inside the loop you end the whole method execution. So if you enable those three lines you end the execution of your method (and the for loop) after the first element.
This means you should yield your request instead of returning it.

Related

Result is not saved in json

I am using scrapy and running this script:
import scrapy
from ..items import SizeerItem
from scrapy.http.request import Request
class SizeerSpiderSpider(scrapy.Spider):
name = 'sizeer'
pg = 0
currentPg = 2
start_urls = [
'https://sizeer.lt/moterims'
]
def parse(self, response):
items = SizeerItem()
pages = response.xpath("//nav[#class='m-pagination']//span[3]/text()").extract()
pages = list(dict.fromkeys(pages))
if self.pg == 0:
pages = list(int(s) for s in pages[0].split() if s.isdigit())
self.pg = pages[0]
name = response.xpath("//div[#class='b-productList_content']//a/#href").extract()
items['name'] = list(dict.fromkeys(name))
while self.currentPg <= self.pg:
url = response.request.url + "?sort=default&limit=60&page=" + str(self.currentPg)
self.currentPg += 1
yield Request(url, callback=self.parse)
This way:
scrapy crawl sizeer -s FEED_URI='mydata.json' -s FEED_FORMAT=json
But after that my mydata.json is empty. This is my first time trying to 'play' with it and can't really understand where is the issue.
You also need to yield the items you scrape so Scrapy Engine will run them through the pipelines and thorugh the Feed Export (which is what you need to export to the file).
Since yield is non-blocking you can add just after populating it and the function will still yield your requests after:
...
name = response.xpath("//div[#class='b-productList_content']//a/#href").extract()
items['name'] = list(dict.fromkeys(name))
yield items # <<< Here for example
while self.currentPg <= self.pg:
...
As #yordan pointed out, you can simplify the way you are executing the spider like this: (However it's not the solution to the problem)
scrapy crawl sizeer -o mydata.json
Try this one:
Scrapy use item and save data in a json file
Pay attention to the yielding and the calling of the spider.

Running scrapy tasks in a loop

I have this code:
from logging import INFO
import scrapy
class LinkedInAnonymousSpider(scrapy.Spider):
name = "linkedin_anonymous"
allowed_domains = ["linkedin.com"]
start_urls = []
base_url = "https://www.linkedin.com/pub/dir/?first=%s&last=%s&search=Search"
def __init__(self, input=None, first=None, last=None):
self.input = input # source file name
self.first = first
self.last = last
def start_requests(self):
if self.first and self.last: # taking input from command line parameters
url = self.base_url % (self.first, self.last)
yield self.make_requests_from_url(url)
elif self.input: # taking input from file
i = 0
self.log('Input from file: %s' % self.input, INFO)
for line in open(self.input, 'r').readlines():
i += 1
if line.strip(): # no blank line
t = line.split("\t")
name = t[0]
parts = [n.strip() for n in name.split(' ')]
last = parts.pop()
first = " ".join(parts)
if first and last:
url = self.base_url % (first, last)
yield self.make_requests_from_url(url)
else:
raise Exception('No input.')
def parse(self, response):
# if there is exactly one match the person's profile page is returned
if response.xpath('//div[#class="profile-overview-content"]').extract():
yield scrapy.Request(response.url, callback=self.parse_full_profile_page)
else:
# extracting profile urls from search result
for sel in response.css('div.profile-card'):
url = sel.xpath('./*/h3/a/#href').extract()[0] # Person's full profile URL in LinkedIn
yield scrapy.Request(url, callback=self.parse_full_profile_page)
........
With this code, I get the profile details of a list of people from linkedin.
I have written such a main function in order to do that.
import scrapy
import sys
from linkedin_anonymous_spider import LinkedInAnonymousSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
if __name__ == "__main__":
firstname = ['Hasan', 'James']
lastname = ['Arslan', 'Bond']
for a in range(len(firstname)):
settings = get_project_settings()
crawler = CrawlerProcess(settings)
spider = LinkedInAnonymousSpider()
crawler.crawl(spider, [], firstname[a], lastname[a])
crawler.start()
When the loop comes to the 2nd step, I get this error:
raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
How can I fix the problem?
Thanks.
You can only run one reactor, so just calling crawler.start() once.
Try passing crawler.start() out of the loop.
Here is a correct version:
firstname = ['Hasan', 'James']
lastname = ['Arslan', 'Bond']
settings = get_project_settings()
crawler = CrawlerProcess(settings)
for a in range(len(firstname)):
crawler.crawl(LinkedInAnonymousSpider, [], firstname[a], lastname[a])
crawler.start()

Scrapy (Python): Iterating over 'next' page without multiple functions

I am using Scrapy to grab stock data from Yahoo! Finance.
Sometimes, I need to loop over several pages, 19 in this example , in order to get all of the stock data.
Previously (when I knew there would only be two pages), I would use one function for each page, like so:
def stocks_page_1(self, response):
returns_page1 = []
#Grabs data here...
current_page = response.url
next_page = current_page + "&z=66&y=66"
yield Request(next_page, self.stocks_page_2, meta={'returns_page1': returns_page1})
def stocks_page_2(self, response):
# Grab data again...
Now, instead of writing 19 or more functions, I was wondering if there was a way I could loop through an iteration using one function to grab all data from all pages available for a given stock.
Something like this:
for x in range(30): # 30 was randomly selected
current_page = response.url
# Grabs Data
# Check if there is a 'next' page:
if response.xpath('//td[#align="right"]/a[#rel="next"]').extract() != ' ':
u = x * 66
next_page = current_page + "&z=66&y={0}".format(u)
# Go to the next page somehow within the function???
Updated Code:
Works, but only returns one page of data.
class DmozSpider(CrawlSpider):
name = "dnot"
allowed_domains = ["finance.yahoo.com", "http://eoddata.com/"]
start_urls = ['http://finance.yahoo.com/q?s=CAT']
rules = [
Rule(LinkExtractor(restrict_xpaths='//td[#align="right"]/a[#rel="next"]'),
callback='stocks1',
follow=True),
]
def stocks1(self, response):
returns = []
rows = response.xpath('//table[#class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
values = cells[-1]
try:
float(values)
returns.append(values)
except ValueError:
continue
except ValueError:
continue
unformatted_returns = response.meta.get('returns_pages')
returns = [float(i) for i in returns]
global required_amount_of_returns, counter
if counter == 1 and "CAT" in response.url:
required_amount_of_returns = len(returns)
elif required_amount_of_returns == 0:
raise CloseSpider("'Error with initiating required amount of returns'")
counter += 1
print counter
# Iterator to calculate Rate of return
# ====================================
if data_intervals == "m":
k = 12
elif data_intervals == "w":
k = 4
else:
k = 30
sub_returns_amount = required_amount_of_returns - k
sub_returns = returns[:sub_returns_amount]
rate_of_return = []
if len(returns) == required_amount_of_returns or "CAT" in response.url:
for number in sub_returns:
numerator = number - returns[k]
rate = numerator/returns[k]
if rate == '':
rate = 0
rate_of_return.append(rate)
k += 1
item = Website()
items = []
item['url'] = response.url
item['name'] = response.xpath('//div[#class="title"]/h2/text()').extract()
item['avg_returns'] = numpy.average(rate_of_return)
item['var_returns'] = numpy.cov(rate_of_return)
item['sd_returns'] = numpy.std(rate_of_return)
item['returns'] = returns
item['rate_of_returns'] = rate_of_return
item['exchange'] = response.xpath('//span[#class="rtq_exch"]/text()').extract()
item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
items.append(item)
yield item
You see, a parse callback is just a function that takes the response and returns or yields either Items or Requests or both. There is no issue at all with reusing these callbacks, so you can just pass the same callback for every request.
Now, you could pass the current page info using the Request meta but instead, I'd leverage the CrawlSpider to crawl across every page. It's really easy, start generating the Spider with the command line:
scrapy genspider --template crawl finance finance.yahoo.com
Then write it like this:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
Scrapy 1.0 has deprecated the scrapy.contrib namespace for the modules above, but if you're stuck with 0.24, use scrapy.contrib.linkextractors and scrapy.contrib.spiders.
from yfinance.items import YfinanceItem
class FinanceSpider(CrawlSpider):
name = 'finance'
allowed_domains = ['finance.yahoo.com']
start_urls = ['http://finance.yahoo.com/q/hp?s=PWF.TO&a=04&b=19&c=2005&d=04&e=19&f=2010&g=d&z=66&y=132']
rules = (
Rule(LinkExtractor(restrict_css='[rel="next"]'),
callback='parse_items',
follow=True),
)
LinkExtractor will pick up the links in the response to follow, but it can be limited with XPath (or CSS) and regular expressions. See documentation for more.
Rules will follow the links and call the callback on every response. follow=True will keep extracting links on every new response, but it can be limited by depth. See documentation again.
def parse_items(self, response):
for line in response.css('.yfnc_datamodoutline1 table tr')[1:-1]:
yield YfinanceItem(date=line.css('td:first-child::text').extract()[0])
Just yield the Items, since Requests for the next pages will be handled by the CrawlSpider Rules.

Problems with passing global variables in a Python scrapy project

In a Scrapy project I am doing, I am having difficulties in sending a variable containing a list from one function to another. I need to do so, as I need to combine the values from one page along with another at the end of the script. The code is as follows:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider,Rule
from six import string_types
from datetime import datetime
from decimal import Decimal
import itertools
import numpy
import urlparse
import scrapy
class DmozSpider(Spider):
name = "dnot"
allowed_domains = ["ca.finance.yahoo.com", "http://eoddata.com/"]
start_urls = [
"http://eoddata.com/stocklist/TSX.htm"
]
def parse(self,response):
companyList = response.xpath('//tr[#class="ro"]/td/a/text()').extract()
for company in companyList:
go = 'https://ca.finance.yahoo.com/q/hp?s={0}.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m'.format(company)
for link in go:
yield Request(go, self.stocks1)
def stocks1(self, response):
# global returns_page1
# EAFP = Easier to ask for forgiveness then permission
# Gathers ONLY adjusted closing stock price
global returns_page1
returns_page1 = []
rows = response.xpath('//table[#class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
datetime.strptime(cells[0], "%b %d, %Y")
values = cells[-1]
returns_page1.append(values)
except ValueError:
continue
current_page = response.url
next_page = current_page + "&z=66&y=66"
yield Request(next_page, self.stocks2)
def stocks2(self, response):
item = Website()
global returns_page1
returns_page2 = []
rows = response.xpath('//table[#class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
datetime.strptime(cells[0], "%b %d, %Y")
values = cells[-1]
returns_page2.append(values)
except ValueError:
continue
returns_tot = returns_page1 + returns_page2
returns_dec = [Decimal(float(i)) for i in returns_tot]
returns = [float(n) for n in returns_dec]
items = []
item = Website()
item['url'] = response.url
item['name'] = response.xpath('//div[#class="title"]/h2/text()').extract()
item['avgreturns'] = numpy.mean(returns)
item['varreturns'] = numpy.var(returns)
item['sdreturns'] = numpy.std(returns)
item['returns'] = returns
items.append(item)
yield item
I am trying to combine returns_page1 from the def stocks1 function with returns_page2 that is gathered in the def stocks2 function. However my output is only giving me the values from the returns_page2 variable.
I know I can't put a return in the def stocks1 function because I have a yield in there. That's why I tried using global variables.
What am I doing wrong here?
Best way of passing values from one function to another is using meta in request,
in first function
yield Request(next_page, self.stocks2, meta={'returns_page1': returns_page1})
in second function
returns_page1 = response.meta.get('returns_page1')

Scrapy multiple search terms

I am very new to Python and I am in the process of learning on how scrape web pages (1 day in). The task I want to achieve is to loop through a list of 2000 companies and extract revenue data and the number of employees. I started by using scrapy, and I have managed to get the workflow to work for one company (not elegant, but at least I am trying)- but I cannot figure out how I can load the list of companies and loop through to carry out multiple searches. I have a feeling this is a fairly simple procedure.
So, my main question is - where in the spider class should I define the query array of companies to loop through? I do not know the exact URLs since each company has a unique ID and belongs to specific market - so I can not input them as start_urls.
Is Scrapy the right tool or should I have used mechanize - for this type of task?
Here is my current code.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
from scrapy.http import Request
from tutorial.items import DmozItem
import json
class DmozSpider(BaseSpider):
name = "dmoz"
allowed_domains = ["proff.se"]
start_urls = ["http://www.proff.se"]
# Search on the website, currently I have just put in a static search term here, but I would like to loop over a list of companies.
def parse(self, response):
return FormRequest.from_response(response, formdata={'q': rebtel},callback=self.search_result)
# I fetch the url from the search result and convert it to correct Financial-url where the information is located.
def search_result(self,response):
sel = HtmlXPathSelector(response)
link = sel.xpath('//ul[#class="company-list two-columns"]/li/a/#href').extract()
finance_url=str(link[0]).replace("/foretag","http://www.proff.se/nyckeltal")
return Request(finance_url,callback=self.parse_finance)
# I Scraped the information of this particular company, this is hardcoded and will not
# work for other responses. I had some issues with the encoding characters
# initially since they were Swedish. I also tried to target the Json element direct by
# revenue = sel.xpath('#//*[#id="accountTable1"]/tbody/tr[3]/#data-chart').extract()
# but was not able to parse it (error - expected string or buffer - tried to convert it
# to a string by str() with no luck, something off with the formatting, which is messing the the data types.
def parse_finance(self, response):
sel = HtmlXPathSelector(response)
datachart = sel.xpath('//tr/#data-chart').extract()
employees=json.loads(datachart[36])
revenue = json.loads(datachart[0])
items = []
item = DmozItem()
item['company']=response.url.split("/")[-5]
item['market']=response.url.split("/")[-3]
item['employees']=employees
item['revenue']=revenue
items.append(item)
return item
The common approach is to do this with a command-line argument. Give the spider's __init__ method an argument:
class ProffSpider(BaseSpider):
name = "proff"
...
def __init__(self, query):
self.query = query
def parse(self, response):
return FormRequest.from_response(response,
formdata={'q': self.query},
callback=self.search_result
)
...
And then start your spiders (maybe with Scrapyd):
$ scrapy crawl proff -a query="something"
$ scrapy crawl proff -a query="something else"
If you want to run a bunch of spiders at once by passing in the arguments from a file, you can create a new command to run multiple instances of a spider. This is just mixing the builtin crawl command with the example code for running multiple spiders with a single crawler:
your_project/settings.py
COMMANDS_MODULE = 'your_project_module.commands'
your_project/commands/__init__.py
# empty file
your_project/commands/crawl_many.py
import os
import csv
from scrapy.commands import ScrapyCommand
from scrapy.utils.python import without_none_values
from scrapy.exceptions import UsageError
class Command(ScrapyCommand):
requires_project = True
def syntax(self):
return '[options]'
def short_desc(self):
return 'Run many instances of a spider'
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option('-f', '--input-file', metavar='FILE', help='CSV file to load arguments from')
parser.add_option('-o', '--output', metavar='FILE', help='dump scraped items into FILE (use - for stdout)')
parser.add_option('-t', '--output-format', metavar='FORMAT', help='format to use for dumping items with -o')
def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
if not opts.output:
return
if opts.output == '-':
self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
else:
self.settings.set('FEED_URI', opts.output, priority='cmdline')
feed_exporters = without_none_values(self.settings.getwithbase('FEED_EXPORTERS'))
valid_output_formats = feed_exporters.keys()
if not opts.output_format:
opts.output_format = os.path.splitext(opts.output)[1].replace('.', '')
if opts.output_format not in valid_output_formats:
raise UsageError('Unrecognized output format "%s". Valid formats are: %s' % (opts.output_format, tuple(valid_output_formats)))
self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')
def run(self, args, opts):
if args:
raise UsageError()
with open(opts.input_file, 'rb') as handle:
for spider_options in csv.DictReader(handle):
spider = spider_options.pop('spider')
self.crawler_process.crawl(spider, **spider_options)
self.crawler_process.start()
You can run it like so:
$ scrapy crawl_many -f crawl_options.csv -o output_file.jsonl
The format of the crawl options CSV is simple:
spider,query,arg2,arg3
proff,query1,value2,value3
proff,query2,foo,bar
proff,query3,baz,asd
The first thing I'd do is to create a list of companies and find a way to get the url of each one. After this crawling is easy. I have written a crawler to extract disease information from wikipedia from a list of diseases. See how it fits your use case.
import requests
from bs4 import BeautifulSoup
import sys
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from subprocess import Popen, check_call
from multiprocessing import Pool
#nltk.download()
def crawlwiki(keywords):
print (keywords)
columns = ['Category', 'Text']
page=1
print ('Fetching for {}....'.format(keywords))
url = 'https://en.wikipedia.org/wiki/'
for i in range(len(keywords)):
url = url + keywords[i]
url = url + '%20'
url = url[0:(len(url)-3)]
output_obj = {}
#curr_page = url+str(page)
while True:
try:
page_source = requests.get(url)
except:
#What you should do if internet connection fails
break
plain_text = page_source.text
bs_obj = BeautifulSoup(plain_text, "lxml")
'''toc_links = bs_obj.findAll('div', {'class': 'toc-links'})
base_url = 'http://www.webmd.com'
for div in toc_links:
links = div.findAll('a')
for a in links:
output_obj[a.text] = base_url + a.get('href')
print (base_url + a.get('href'))
data = bs_obj.findAll('div', {'class':'search-text-container'})
for div in data:
links = div.findAll('a')
for a in links:
output_obj[a.text] = a.get('href')
print (a.get('href'))'''
"""
Mapping:
1 : Signs and symptoms
2 : Diagnosis
3 : Prognosis
4 : Treatment
"""
symptom_text = re.findall ( '<h2><span class="mw-headline" id="Signs_and_symptoms">Signs and symptoms</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(symptom_text)
symptoms_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
symptom_data = symptoms_object.findAll('p')
symptom_paragraphs = ""
for p in symptom_data:
symptom_paragraphs += p.text
symptom_paragraphs = re.sub(r"/?\[\d+]" , '', symptom_paragraphs, re.DOTALL)
df_1 = pd.DataFrame(data=[['1', symptom_paragraphs]], columns=columns)
diagnosis_text = re.findall ( '<h2><span class="mw-headline" id="Diagnosis">Diagnosis</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(diagnosis_text)
diagnosis_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
diagnosis_data = diagnosis_object.findAll('p')
diagnosis_paragraphs = ""
for p in diagnosis_data:
diagnosis_paragraphs += p.text
diagnosis_paragraphs = re.sub(r"/?\[\d+]" , '', diagnosis_paragraphs, re.DOTALL)
df_2 = pd.DataFrame(data=[['2', diagnosis_paragraphs]], columns=columns)
prognosis_text = re.findall ( '<h2><span class="mw-headline" id="Prognosis">Prognosis</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(prognosis_text)
prognosis_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
prognosis_data = prognosis_object.findAll('p')
prognosis_paragraphs = ""
for p in prognosis_data:
prognosis_paragraphs += p.text
prognosis_paragraphs = re.sub(r"/?\[\d+]" , '', prognosis_paragraphs, re.DOTALL)
df_3 = pd.DataFrame(data=[['3', prognosis_paragraphs]], columns=columns)
treatment_text = re.findall ( '<h2><span class="mw-headline" id="Treatment">Treatment</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(treatment_text)
treatment_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
treatment_data = treatment_object.findAll('p')
treatment_paragraphs = ""
for p in treatment_data:
treatment_paragraphs += p.text
treatment_paragraphs = re.sub(r"/?\[\d+]" , '', treatment_paragraphs, re.DOTALL)
df_4 = pd.DataFrame(data=[['4', treatment_paragraphs]], columns=columns)
df = pd.DataFrame(columns = columns)
df = df.append(df_1.append(df_2.append(df_3.append(df_4))))
return df
print('Fetch completed....')
def main():
disease_df = pd.read_csv("disease.txt", sep="\n", header=None)
columns = ['Category', 'Text']
df_data = pd.DataFrame(columns=columns)
size = disease_df.size
print("Initializing....")
p = Pool(5)
df_data = p.map(crawlwiki, disease_df.values.tolist())
"""for index, row in disease_df.iterrows():
print('Iteration {0} out of {1}.....'.format(index+1, size))
df = crawlwiki(row, columns)
df_data = df_data.append(df)"""
df_data.to_csv("TagDataset.csv", index=False)
if __name__ == '__main__':
main()

Categories

Resources