I am trying to scrape Twitter.
To reproduce, go to search.twitter.com and enter Comorbidity in the search form.
I can get the first page correctly, and when scrolling down for more tweets I can see that the next page can be fetched via the min_position param.
But when I send the request for the next page, I don't get the correct content.
Here is some of my code.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}

def start_requests(self):
    yield Request(url=self.start_urls[0], callback=self.parse_search_page)

def parse_search_page(self, response):
    keyword = 'Comorbidity'
    search_url = self.search_url.format(keyword=keyword)
    yield Request(url=search_url, callback=self.parse_twitter_page, headers=self.headers)

def parse_twitter_page(self, response):
    next_page = None
    if self.current_page == 0:
        posts = response.xpath('//li[@data-item-type="tweet"]').extract()
        min_position = re.search('data-min-position="(.*?)"', response.body)
        if min_position:
            min_position = min_position.group(1)
            next_page = self.next_page_url.format(position=min_position.replace('cm+', 'cm%2B').replace('==', '%3D%3D'))
            self.current_page = 1
    else:
        json_data = json.loads(response.body)
        min_position = json_data.get('min_position')
    if next_page:
        yield scrapy.http.Request(
            url=self.next_page_url,
            callback=self.parse_twitter_page,
            headers=self.headers,
        )
How can I get the correct min_position?
I think there is a mistake in your parse_twitter_page method.
if next_page:
    yield scrapy.http.Request(
        url=next_page,
        callback=self.parse_twitter_page,
        headers=self.headers,
    )
It should not be self.next_page_url.
I changed self.next_page_url to next_page.
I hope this works.
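For completeness, here is a rough sketch of how the whole method could look once that change is in, assuming next_page_url is a format string with a {position} placeholder (as in your code) and that every later AJAX response carries its own min_position for the page after it. I have not run this against Twitter, so treat it as a sketch rather than a drop-in fix:

import json
import re
import urllib.parse

import scrapy

# This method is meant to sit inside the spider class from the question,
# which already defines headers, current_page and the next_page_url template.
def parse_twitter_page(self, response):
    if self.current_page == 0:
        # The first response is HTML: pull the pagination cursor out of the markup.
        match = re.search(r'data-min-position="(.*?)"', response.text)
        min_position = match.group(1) if match else None
        self.current_page = 1
    else:
        # Later responses come back as JSON from the AJAX endpoint.
        min_position = json.loads(response.text).get('min_position')

    if min_position:
        # Let urllib handle the escaping instead of hand-replacing characters.
        next_page = self.next_page_url.format(
            position=urllib.parse.quote(min_position, safe=''))
        yield scrapy.Request(
            url=next_page,                      # the freshly built URL, not self.next_page_url
            callback=self.parse_twitter_page,
            headers=self.headers,
        )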
I'm building a Scrapy crawler/spider for a particular website: I give it a starting URL (let's call it start_urls) and it should get the response of every URL (matching certain parameters) contained on it.
Summarizing: it should enter start_url, then follow the company_urls allowed by the allow parameter in the rule, and return the response of each company URL (everything done with headers). My code only parses the start_urls and never reaches the URLs I want to hand to the parser. What could be wrong?
LinkExtractor
link_extractor = LinkExtractor(
    allow=['/organization/'],
    allow_domains=['www.scrapsite.com'],
    deny_extensions=IGNORED_EXTENSIONS,  # Filter *.zip, *.csv, etc. (add other extensions as required)
    process_value=lambda url: process_url_value(url, NAME, cleaning_regex=[company_regex]),
)
ScrapySpider
class scrapsiteSpider(CrawlSpider):
    name = NAME
    download_delay = 5.0
    main_page = MAIN_PAGE
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    }
    start_urls = [
        f'https://www.scrapsite.com/search/companies/field/companies/company_page/{rank}'
        for rank in range(75, 132, 10)
    ]
    rules = [Rule(link_extractor, callback='parse', follow=True)]
    headers = HEADERS

    @classmethod
    def start_requests(cls):
        logger.info('Starting scrapsite scraping')
        for url in cls.start_urls:
            cls.log_counter += 1
            if cls.log_counter % cls.log_divider == 0:
                logger.info(f'Start request: {url}')
            yield Request(url, dont_filter=True, headers=HEADERS)

    @classmethod
    def parse(cls, response: Response):
        # CAPTURE COMPANIES
        logger.info(f"#### parse PREPROCESSING company {response.url}")
        logger.info(f"{response.meta}")
        if company_regex.search(response.url):
            logger.info(f"Company Detected: {response.url.split('/')[-1]}")
            return cls.parse_item(response, AddedItem())

    @classmethod
    def parse_item(cls, response: Response, item: Item) -> Item:
        logger.info(f"#### parse_item PREPROCESSING company {response.url}")
        item.set_url(value=response.url)
        item.set_source(value=cls.name)
        item.set_response_data(value=response.text)
        item.set_uuid(value=make_id_from_url(url=response.url))
        yield item
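One detail worth flagging here: the Scrapy documentation explicitly warns against using parse as a CrawlSpider Rule callback, because CrawlSpider uses parse internally to apply its rules, and Scrapy also expects these callbacks to be ordinary instance methods rather than classmethods. A minimal sketch of that wiring (names are hypothetical; the LinkExtractor arguments are taken from the snippet above, everything else in the project is assumed unchanged):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ScrapsiteSketchSpider(CrawlSpider):
    # Hypothetical, trimmed-down version of the spider above: the Rule
    # callback is NOT named 'parse', because CrawlSpider relies on parse()
    # internally to follow its rules.
    name = 'scrapsite_sketch'
    start_urls = [
        f'https://www.scrapsite.com/search/companies/field/companies/company_page/{rank}'
        for rank in range(75, 132, 10)
    ]
    rules = [
        Rule(
            LinkExtractor(allow=['/organization/'],
                          allow_domains=['www.scrapsite.com']),
            callback='parse_company',  # any name except 'parse'
            follow=True,
        ),
    ]

    def parse_company(self, response):
        # Ordinary instance method; CrawlSpider hands each matched
        # company page response to it.
        yield {'url': response.url}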
I don't know how to scrape AJAX pages. There is no pagination on the website; more results are loaded by clicking the load more button. This is the page link: https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false
import scrapy
from scrapy.http import Request
from selenium import webdriver
from scrapy_selenium import SeleniumRequest
import pandas as pd

class TestSpider(scrapy.Spider):
    name = 'test'

    def start_requests(self):
        yield SeleniumRequest(
            url="https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        books = response.xpath("//h3[@class='card-Title\nbreak-word\nf3\nmb1\nmt0']//a//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        title = response.css(".mr3-m::text").get()
        address = response.css(".showcase-address::text").get()
        address = address.strip()
        website = response.xpath("//li[@class='dib ml3 mr3']//a[starts-with(@href, 'http')]/@href").get()
        website = website.strip()
        phone = response.xpath("//li[@class='dib ml3 mr3'] //span[contains(text(), 'Phone:')]/following-sibling::text()").get()
        phone = phone.strip().replace("-", "")
        yield {
            'title': title,
            'address': address,
            'website': website,
            'phone': phone
        }
Okay, try the following script to get all the fields you wish to grab, traversing the whole exhibitor list:
import scrapy
from scrapy.selector import Selector

class MapYourShowSpider(scrapy.Spider):
    name = "mapyourshow"
    content_url = 'https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm'
    inner_base = 'https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={}'
    headers = {
        'x-requested-with': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }
    params = {
        'action': 'search',
        'searchtype': 'exhibitorgallery',
        'searchsize': '557',
        'start': '0',
    }

    def start_requests(self):
        yield scrapy.FormRequest(
            url=self.content_url,
            method='GET',
            headers=self.headers,
            formdata=self.params,
            callback=self.parse,
        )

    def parse(self, response):
        for item in response.json()['DATA']['results']['exhibitor']['hit']:
            inner_link = self.inner_base.format(item['fields']['exhid_l'])
            yield scrapy.Request(
                url=inner_link,
                headers=self.headers,
                callback=self.parse_content,
            )

    def parse_content(self, response):
        elem = response.json()['DATA']['BODYHTML']
        sel = Selector(text=elem)
        title = sel.css("h2::text").get()
        try:
            address = ' '.join([' '.join(i.split()) for i in sel.css("p.showcase-address::text").getall()])
        except AttributeError:
            address = ""
        website = sel.css("a[title*='website']::text").get()
        phone = sel.xpath("normalize-space(//*[starts-with(@class,'showcase-web-phone')]/li[./*[.='Phone:']]/span/following::text())").get()
        yield {"title": title, "address": address, "website": website, "phone": phone}
I have not used your code and did it my own way instead (because I'm not a huge fan of Selenium), but I hope this helps anyway:
import requests
import json
import time
from bs4 import BeautifulSoup
import re

headers = {
    'x-requested-with': 'XMLHttpRequest',
}

params = {
    'action': 'search',
    'searchtype': 'exhibitorgallery',
    'searchsize': '200',  # don't increase this too much (increase the start parameter instead and send a new request after some delay)
    'start': '0',
}

response = requests.get('https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm', params=params, headers=headers)
data = json.loads(response.text)

all_sites = []
for exs in data["DATA"]["results"]["exhibitor"]["hit"]:
    id = exs["fields"]["exhid_l"]
    site = f"https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={id}"
    all_sites.append(site)

for site in all_sites:
    response = requests.get(site)
    soup = BeautifulSoup(response.text, "html.parser")
    info_box = soup.find("div", {"id": "showroomContentDiv"})
    title = info_box.find("section", {"id": "scroll-description"}).text.strip().split("\n")[0][6:]
    address = " ".join(info_box.find("p", {"class": "showcase-address"}).text.strip().split())
    website = info_box.find("ul", {"class": "showcase-web-phone"}).find_all("li")[0].text.strip()
    phone = info_box.find("ul", {"class": "showcase-web-phone"}).find_all("li")[1].text[7:].strip()
    print(title)
    print(address)
    print(website)
    print(phone)
    # delay so you don't create too much traffic
    time.sleep(1)
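Following up on the comment about the start parameter: here is a minimal sketch of how the paging could be driven from it, under the assumption (not verified against the live site) that the endpoint keeps accepting larger start offsets and returns an empty hit list once the results run out:

import time

import requests

headers = {'x-requested-with': 'XMLHttpRequest'}
base_url = 'https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm'
page_size = 200

all_ids = []
start = 0
while True:
    params = {
        'action': 'search',
        'searchtype': 'exhibitorgallery',
        'searchsize': str(page_size),
        'start': str(start),
    }
    response = requests.get(base_url, params=params, headers=headers)
    hits = response.json()['DATA']['results']['exhibitor']['hit']
    if not hits:
        break  # assumed: an empty page means we are past the last exhibitor
    all_ids.extend(hit['fields']['exhid_l'] for hit in hits)
    start += page_size
    time.sleep(1)  # keep the polite delay from the script above

print(len(all_ids), 'exhibitor ids collected')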
I have a problem with going to the next page: the spider moves to the next page but then returns to the first page, and it only gives me the data of page 1. I have tried different approaches but have not been able to solve this problem. If there is any solution, please share it. This is the page link: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from scrapy.http import Request
from selenium import webdriver

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            yield Request(url, callback=self.parse_book)

    def __init__(self):
        self.driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver.exe')

    def parse_book(self, response):
        title = response.xpath("//span[@id='HeadingContent_lblTitle']//text()").get()
        d1 = response.xpath("//div[@class='col-md-10']//p[1]//text()").get()
        d1 = d1.strip()
        d2 = response.xpath("//div[@class='col-md-10']//p[2]//text()").get()
        d2 = d2.strip()
        d3 = response.xpath("//div[@class='col-md-10']//p[3]//span//text()").get()
        d3 = d3.strip()
        d4 = response.xpath("//div[@class='col-md-10']//p[4]//text()").get()
        d4 = d4.strip()
        yield {
            "title1": title,
            "title2": d1,
            "title3": d2,
            "title4": d3,
            "title5": d4,
        }
        self.driver.get(response.url)
        while True:
            next = self.driver.find_element_by_xpath("//a[@id='MainContent_PagerTop_NavNext']")
            try:
                next.click()
                # get the data and write it to scrapy items
            except:
                break
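For context: the page in the question looks like an ASP.NET WebForms site, and those usually paginate through a form POST-back that carries hidden fields such as __VIEWSTATE and __EVENTTARGET rather than a changing URL. Whether that is what sends this spider back to page 1 is an assumption; the sketch below only shows the usual FormRequest.from_response pattern, and the __EVENTTARGET value is a guess derived from the pager link id in the question's XPath:

import scrapy
from scrapy.http import FormRequest


class LawyersSketchSpider(scrapy.Spider):
    # Hedged sketch of a typical ASP.NET WebForms pager; field values other
    # than the standard hidden inputs are assumptions, not verified.
    name = 'lawyers_sketch'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']

    def parse(self, response):
        for href in response.xpath("//div[@class='list-group']//@href").getall():
            yield response.follow(href, callback=self.parse_lawyer)

        # Stop when the "next" link is no longer present
        # (assumption: the pager drops or disables it on the last page).
        if response.xpath("//a[@id='MainContent_PagerTop_NavNext']"):
            # from_response() copies the hidden __VIEWSTATE/__EVENTVALIDATION
            # fields; we only override the event target the link would post.
            yield FormRequest.from_response(
                response,
                formdata={
                    # Hypothetical event target based on the pager link id above.
                    '__EVENTTARGET': 'ctl00$MainContent$PagerTop$NavNext',
                    '__EVENTARGUMENT': '',
                },
                callback=self.parse,
                dont_filter=True,  # the URL does not change between pages
            )

    def parse_lawyer(self, response):
        yield {
            'title': response.xpath(
                "//span[@id='HeadingContent_lblTitle']//text()").get(),
        }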
I've built a simple scrapy spider running on scrapinghub:
import scrapy
from scrapy_splash import SplashRequest

class ExtractionSpider(scrapy.Spider):
    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def parse(self, response):
        urls = response.css('a.offer-details__title-link::attr(href)').extract()
        print(urls)
        for url in urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)
        multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
        print(multiple_locs_urls)
        for url in multiple_locs_urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)
        next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield SplashRequest(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        yield {
            'title': response.css('#jobTitle').extract_first(),
            'content': response.css('#description').extract_first(),
            'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
            'address': response.css('span[itemprop="address"]').extract_first()
        }
The problem I am facing is that the multiple_locs_urls selector returns an empty list, despite my seeing the markup in the browser.
I checked with scrapy shell, and scrapy shell does not see the markup either. I guess this is because the markup is rendered through JavaScript when the page is loaded.
I added Splash, but that does not seem to apply to the response. How can I make Scrapy wait with the query until the page is loaded?
See the source code for the page: view-source:pracuj.pl/praca/polska;ct,1.
There is no element with the class "offer-regions__label" in the HTML code.
This code will always return an empty list:
multiple_locs_urls = response.css('a.offer-regions__label::attr(href)')
But as explained here https://stackoverflow.com/a/17697329/9913319:
Many times when crawling we run into problems where content that is rendered on the page is generated with Javascript and therefore scrapy is unable to crawl for it.
In this case you can use Selenium.
I changed your code and checked it and it works:
import scrapy
from scrapy_splash import SplashRequest
from selenium import webdriver

class ExtractionSpider(scrapy.Spider):
    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        profile = webdriver.FirefoxProfile("pathToFirefoxProfile")
        firefox_binary = "pathToFirefoxBinary"  # Must be the developer edition!!!
        # self.driver = webdriver.Firefox()
        self.driver = webdriver.Firefox(profile, firefox_binary=firefox_binary)

    def parse(self, response):
        self.driver.get(response.url)
        elements = self.driver.find_elements_by_css_selector("a.offer-details__title-link")
        self.driver.get(response.url)
        for element in elements:
            print("****")
            print(str(element.get_attribute("href")))
            print(str(element.text))

        # your old code below
        urls = response.css('a.offer-details__title-link::attr(href)').extract()
        print(urls)
        for url in urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)
        multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
        print(multiple_locs_urls)
        for url in multiple_locs_urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)
        next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield SplashRequest(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        yield {
            'title': response.css('#jobTitle').extract_first(),
            'content': response.css('#description').extract_first(),
            'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
            'address': response.css('span[itemprop="address"]').extract_first()
        }
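As an alternative to switching to Selenium: if the only problem is that Splash returns the page before the JavaScript has rendered, scrapy-splash's SplashRequest takes an args dict whose wait value tells Splash how long to wait before rendering the response. Whether that is enough for this particular site is untested here; a minimal sketch, assuming the scrapy-splash middleware is already configured as in the original project:

import scrapy
from scrapy_splash import SplashRequest


class WaitForRenderSpider(scrapy.Spider):
    # Hypothetical minimal spider showing the args={'wait': ...} pattern.
    name = 'wait_for_render'
    start_urls = ['http://somedomainstart']

    def start_requests(self):
        for url in self.start_urls:
            # Ask Splash to wait ~2 seconds so JavaScript-rendered links
            # exist before the response reaches parse().
            yield SplashRequest(url, callback=self.parse, args={'wait': 2})

    def parse(self, response):
        for href in response.css('a.offer-regions__label::attr(href)').extract():
            yield SplashRequest(response.urljoin(href),
                                callback=self.parse_details,
                                args={'wait': 2})

    def parse_details(self, response):
        yield {'title': response.css('#jobTitle').extract_first()}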
I'm doing a web crawling project which needs to collect users' comments from each of the 200,000 authors' videos on a video website. Recently, this website updated its URLs, adding a new parameter (_signature) to its API URL. Is there any suggestion for fetching this new parameter?
The example API URLs and the pages they refer to are below: https://www.ixigua.com/api/comment_module/video_comment?_signature=vhm.AAgEAy3U9zpQ3OMV74fpuAAOL1&item_id=6698972531753222663&group_id=6698972531753222663&offset=10
refers to: https://www.ixigua.com/i6698972531753222663/
https://www.ixigua.com/api/comment_module/video_comment?_signature=Xs6IHAAgEABXgvIJnklsal7OiAAAAI3&item_id=6699046583612211720&group_id=6699046583612211720&offset=10
refers to: https://www.ixigua.com/i6699046583612211720/
What I have for reaching the original 200,000 authors is a list of their item_id/group_id values (I stored them in Amazon S3; you can find the link in my code below). Also, item_id is the same as group_id. So to move on, all I need is the _signature.
This website assigns a unique _signature to each author. In the example API URLs, the first author's _signature is vh-m.AAgEAy3U9zpQ3OMV74fpuAAOL1 and the second is Xs6IHAAgEABXgvIJnklsal7OiAAAAI3.
This is where I am running into trouble. I went through the website and found the parameter under XHR as part of the query string, but I have no idea how to fetch it. Before this update, the API URL didn't include the _signature, and my original code ran smoothly, as below:
import json

import pandas as pd
import scrapy

class Id1Spider(scrapy.Spider):
    name = 'id1'
    allowed_domains = ['www.ixigua.com']
    df = pd.read_csv('https://s3.amazonaws.com/xiguaid/group_id.csv')
    df = df.iloc[32640:189680, 1]
    list_id = df.unique().tolist()
    i = 0
    start_urls = ["https://www.ixigua.com/api/comment/list/?group_id=" + str(list_id[i]) + "&item_id=" + str(list_id[i]) + "&offset=0&count=20"]
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    offset = 0
    count = 20

    def parse(self, response):
        data = json.loads(response.body)
        comments = data['data']['comments']
        total = data['data']['total']
        for ele in list(comments):
            try:
                comments_id = ele['user']['user_id']
                comments_text = ele['text']
                reply_count = ele['reply_count']
                digg_count = ele['digg_count']
                create_time = ele['create_time']
            except:
                pass
            item = {
                'comments_id': comments_id,
                'comments_text': comments_text,
                'reply_count': reply_count,
                'digg_count': digg_count,
                'create_time': create_time,
                'item_id': self.list_id[self.i]
            }
            yield item
        if data['data']['has_more']:
            self.offset += 20
            if self.offset > total:
                self.offset = total
            elif self.offset <= total:
                self.offset = self.offset
            next_page_url = "https://www.ixigua.com/api/comment/list/?group_id=" + str(self.list_id[self.i]) + "&item_id=" + str(self.list_id[self.i]) + "&offset=" + str(self.offset) + "&count=" + str(self.count)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
        else:
            self.offset = 0
            self.count = 20
            self.i = self.i + 1
            try:
                next_page_url = "https://www.ixigua.com/api/comment/list/?group_id=" + str(self.list_id[self.i]) + "&item_id=" + str(self.list_id[self.i]) + "&offset=" + str(self.offset) + "&count=" + str(self.count)
                yield scrapy.Request(url=next_page_url, callback=self.parse)
            except:
                pass
Thanks in advance for any suggestions! Please let me know if I have missed any information needed to solve this problem, since this is my first post.