How to get search result URLs with Pyppeteer? - python

I am trying to scrape the search result URLs with Pyppeteer in my Python program, but it doesn't work...
Here is my code:
import asyncio
from pyppeteer import launch

URL = 'https://hk.appledaily.com/search/apple'

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto(URL)
    await page.waitForSelector(".flex-feature")
    elements = await page.querySelectorAll('.flex-feature')
    for el in elements:
        text = await page.evaluate('(el) => el.innerHTML.querySelectorAll("story-card")', el)
        print(text)
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())
I hope someone can help! Thanks!

Sorry for the stupid question! I have just figured it out haha...
import asyncio
from pyppeteer import launch
# https://pypi.org/project/pyppeteer/

URL = 'https://hk.appledaily.com/search/apple'

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto(URL)
    # Wait until the search results have rendered
    await page.waitForSelector(".flex-feature")
    # Each result link is a ".story-card" anchor
    elements = await page.querySelectorAll('.story-card')
    for el in elements:
        text = await page.evaluate('(el) => el.textContent', el)
        text2 = await page.evaluate('(el) => el.href', el)
        print(text2)
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())
And the result will be:
https://hk.appledaily.com/entertainment/20201227/LJL5DQ64QZHLTHI7LFKHVXB7JM/
https://hk.appledaily.com/sports/20201227/7MQKJNXPQNA6HDXTFUCMWNGUAU/
https://hk.appledaily.com/local/20201227/SWIBOUDSLZB5JBULTIT4DPSEIQ/
https://hk.appledaily.com/entertainment/20201227/TA457F5YYRGQZCNDIR5OFJDLPU/
https://hk.appledaily.com/china/20201227/DY2RQZJVSZHJBDV6XDYBH5G73I/
https://hk.appledaily.com/sports/20201227/4FLJFIHZOFF3JMWPOOSTO5QLCQ/
https://hk.appledaily.com/local/20201227/NIWG4U4LBFGPHLA73RTWHEQCY4/
https://hk.appledaily.com/china/20201227/SUR6Q4UEIVE5HD7OLSCAYIVUUY/
https://hk.appledaily.com/international/20201227/N2P5IPMBKBEGRALQWMDFXJCVGY/
https://hk.appledaily.com/entertainment/20201227/MGG6H2JIJVGODEV3EE7OI6HEGI/
https://hk.appledaily.com/local/20201227/N3TQO3VOBRC3NKT2ILES76CSKY/
https://hk.appledaily.com/international/20201227/GJXFM53DAFAUVOFFZIRKBH3X24/
https://hk.appledaily.com/sports/20201227/2UQC7A4HCBFD5IF7IGJWVK3AOA/
https://hk.appledaily.com/entertainment/20201226/AI7CAJD6O5D5XP7UMZCWSQ5VU4/
https://hk.appledaily.com/entertainment/20201227/3BIOQMUCQVGHXKNP3A4KF7VC6A/
https://hk.appledaily.com/local/20201227/OOYOPLI5WFGJZGAFKGLHSVINPM/
https://hk.appledaily.com/local/20201227/6FXZ5FKNMVHS5JTTO6YWO55JZY/
https://hk.appledaily.com/local/20201227/VQTZMOKCUZGMFL4PYBZ5YZYOSQ/
https://hk.appledaily.com/international/20201227/4VPFDXJFKZH5ZFRXSKZW3OASAA/
https://hk.appledaily.com/entertainment/20201227/TCVCDXKK4JHE7HHEJ7U6MFSS5U/
https://hk.appledaily.com/local/20201227/NIWG4U4LBFGPHLA73RTWHEQCY4/
https://hk.appledaily.com/entertainment/20201227/GY4WJIFLPREKJHGJ2VQO7LDZAU/
https://hk.appledaily.com/entertainment/20201227/3BIOQMUCQVGHXKNP3A4KF7VC6A/
https://hk.appledaily.com/local/20201227/OOYOPLI5WFGJZGAFKGLHSVINPM/
https://hk.appledaily.com/local/20201227/N3TQO3VOBRC3NKT2ILES76CSKY/
https://hk.appledaily.com/local/20201227/Z4CRG7TLUJFMLO3JIY2KWBTL5A/
https://hk.appledaily.com/local/20201227/353WEBFTBZFHBCP2O4IXIARBEM/
Process finished with exit code 0
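
As a side note, the per-element evaluate calls can be collapsed into a single call with Pyppeteer's querySelectorAllEval (alias JJeval), which runs one page function over all matching elements. A minimal sketch, assuming the same '.story-card' selector as above:

import asyncio
from pyppeteer import launch

URL = 'https://hk.appledaily.com/search/apple'

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto(URL)
    await page.waitForSelector('.story-card')
    # Collect all hrefs in a single round trip to the browser
    hrefs = await page.querySelectorAllEval(
        '.story-card',
        '(els) => els.map(el => el.href)'
    )
    for href in hrefs:
        print(href)
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())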

Related

Getting duplicate network responses in Playwright Python

I have a working script, but when I moved the page.on call in Playwright, the network response handler runs a number of times equal to the loop count. I have been trying to figure out why that happens.
For example, at i=0 it prints response.url once, but at i=10 it prints response.url 10 times and then sends 10 duplicate documents to MongoDB. I have no idea why this is happening. The links being printed are all the same.
It would be a great help if anyone could let me know what I am doing wrong that is causing this issue.
Please see the sample code below.
#imports here

today = datetime.today().strftime("%m%d%Y")
filenamearr = []
mongousername = 'XXX'
mongopassword = 'XXXX'
client = MongoClient("mongodb+srv://%s:%s@XXXXX.XXXX.mongodb.net/?retryWrites=true&w=majority" % (mongousername, mongopassword))
db = client.DB1
logg = []

async def runbrowser(playwright, url):
    async def handle_response(response, buttonnumber):
        l = str(response.url)
        para = 'param'
        if para in l:
            print(response.url)
            textdata = await response.text()
            subtask = asyncio.create_task(jsonparse(textdata))
            done, pending = await asyncio.wait({subtask})
            if subtask in done:
                print("Success in Json parser")
                result = await subtask
                status = [buttonnumber, result]
                logg.append(status)
                print(status)
                logdf = pd.DataFrame(logg)
                logdf.columns = ['BUTTON', 'RESULT']
                fname = 'XXXX' + today + ".csv"
                logdf.to_csv(fname, index=False)

    async def jsonparse(textdata):
        try:
            # parsing happens here to output to MongoDB
            return "Success"
        except Exception as e:
            print("Failed parsing")
            return e

    browser = await playwright.firefox.launch(
        headless=True,
    )
    context = await browser.new_context(
        locale='en-US',
        ignore_https_errors=True,
    )
    page = await context.new_page()
    await page.goto(url, timeout=0)
    button = page.locator("xpath=//button[@event-list-item='']")
    bcount = button.locator(":scope", has_text="Locator")
    count = await bcount.count()
    print(count)
    for i in range(count):
        print("\n\n\n\n\nSleeping 10 seconds before clicking button")
        buttonnumber = i
        await asyncio.sleep(10)
        print("Clicking Button: ", i)
        cbtn = bcount.nth(i)
        await cbtn.hover()
        await asyncio.sleep(4)
        await cbtn.click()
        if i == 0:
            print("i=0")
            await page.reload(timeout=0)
        retry = page.on("response", lambda response: handle_response(response, buttonnumber))
        title = await page.title()
        print(title)
        print("Heading back to the main page.")
        await page.go_back(timeout=0)
        await page.reload()
        await page.wait_for_timeout(5000)
    await page.close()
    print("Closing Tab")
    await browser.close()

async def main():
    tasks = []
    async with async_playwright() as playwright:
        url = 'https://samplelink.com'
        tasks.append(asyncio.create_task(runbrowser(playwright, url)))
        for t in asyncio.as_completed(tasks):
            print(await t)
        await asyncio.gather(*tasks)

asyncio.run(main())
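
A likely explanation, offered as a hedged note rather than a confirmed diagnosis: page.on("response", ...) adds a new listener every time it is called, so calling it inside the for loop leaves i + 1 handlers attached by iteration i, and every one of them fires for each response, which would match the 10 duplicate prints and MongoDB inserts at i=10. A minimal, self-contained sketch (example.com is a placeholder URL) of registering the handler once, before the loop:

import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()

        def handle_response(response):
            print("response:", response.url)

        # Registered once: every response is reported exactly once.
        # If page.on("response", handle_response) were called inside the
        # loop below instead, iteration i would have i + 1 copies of the
        # handler attached and each response would be handled i + 1 times.
        page.on("response", handle_response)

        for i in range(3):
            await page.goto("https://example.com", timeout=0)

        await browser.close()

asyncio.run(main())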

How can I solve these RuntimeWarning errors?

import threading
import asyncio
import discord
import requests
from bs4 import BeautifulSoup

client = discord.Client()

def set_interval(func, sec):
    async def func_wrapper():
        set_interval(func, 1)
        await func()
    t = threading.Timer(sec, func_wrapper)
    t.start()
    return t

async def takip():
    url = ""
    R = requests.get(url)
    Soup = BeautifulSoup(R.text, "html5lib")
    Title = Soup.find("h1", {"class": "pr-new-br"}).getText()
    List = Soup.find("div", {"class": "pr-bx-nm with-org-prc"})
    fiyat = List.find("span", {"class": "prc-dsc"}).getText()
    degisenfiyat = float(fiyat.replace(",", ".").replace(" TL", ""))
    if degisenfiyat <= 200:
        channel = client.get_channel(973939538357522474)
        await channel.send("Fiyat düştü.")  # "The price has dropped."
    print(Title)
    print(fiyat)
    print(degisenfiyat)

@client.event
async def on_ready():
    print(f'{client.user} has connected to Discord!')
    set_interval(takip, 1)

@client.event
async def on_message(message):
    if message.author == client.user:
        return
    if message.content.startswith('$hello'):
        await message.channel.send('Hello!')

client.run("")
RuntimeWarning: coroutine 'set_interval.<locals>.func_wrapper' was never awaited
self.function(*self.args, **self.kwargs)
RuntimeWarning: Enable tracemalloc to get the object allocation traceback
This function is used with discord.py to solve a scraping problem, but when I try to use the set_interval function it gives this error. Any idea how to solve it?
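
The warning appears because threading.Timer calls func_wrapper as a plain function, so the coroutine object it creates is never awaited (and the timer thread has no running event loop anyway). One way around this, as a hedged sketch rather than a drop-in fix, is to let discord.py schedule the repeating job on its own event loop with discord.ext.tasks; the 60-second interval below is illustrative, and the scraping body of takip is elided:

import discord
from discord.ext import tasks

# Newer discord.py releases also require intents, e.g.
# discord.Client(intents=discord.Intents.default())
client = discord.Client()

@tasks.loop(seconds=60)  # illustrative interval; the original polled every second
async def takip():
    # the requests/BeautifulSoup price check from the original takip() goes here
    print("checking price...")

@client.event
async def on_ready():
    print(f'{client.user} has connected to Discord!')
    if not takip.is_running():
        takip.start()

client.run("token")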

Discord bot does not send message to channel

I'm trying to make my bot send a message to a Discord channel if certain conditions are met, but I can't seem to get the code working. The code checks every 5 seconds whether a list contains the string '.12.' and should then forward the message.
import requests
import time
import discord
from discord.ext import commands, tasks
from bs4 import BeautifulSoup

while True:
    client = commands.Bot(command_prefix='.')

    @client.event
    async def on_ready():
        print('bot is active')

    url = 'website link'
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')
    html_element = soup.find_all('td', {"class": "eksam-ajad-aeg"})
    ret = []
    for t in html_element:
        ret.append(t.text)
    print(ret)
    if '.12.' in ret:
        @client.event
        async def send():
            channel = client.get_channel(758088198852182037)
            await channel.send('message')
    client.run('token')
    time.sleep(5)
Here is a bot script that appears to be working. Without having the url that you're attempting to search, I'm not able to help completely, but give this a try and see if it works for you:
import discord
import requests
from bs4 import BeautifulSoup
import asyncio

client = discord.Client()

@client.event
async def on_ready():
    # Create a task that runs check_html with a 5-second interval
    client.loop.create_task(check_html(5))
    print("Bot is active")

async def check_html(time):
    while True:
        url = 'url here'
        res = requests.get(url)
        html = res.text
        soup = BeautifulSoup(html, 'html.parser')
        html_element = soup.find_all('td', {"class": "eksam-ajad-aeg"})
        ret = []
        for t in html_element:
            ret.append(t.text)
        print(ret)
        if '.12.' in ret:
            for guild in client.guilds:
                for channel in guild.channels:
                    if channel.id == 758088198852182037:
                        await channel.send('message')
        # Asynchronously sleep for 'time' seconds
        await asyncio.sleep(time)

client.run('token')
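
One caveat about both versions, offered as a hedged suggestion: requests.get is a blocking call, so while it runs the bot cannot heartbeat or respond to events. Running the request in a worker thread keeps the event loop responsive; a minimal sketch of a hypothetical fetch_html helper using run_in_executor:

import asyncio
import requests

async def fetch_html(url):
    # Run the blocking requests call in a thread pool so the event loop
    # (and the Discord heartbeat) stays responsive while it waits.
    loop = asyncio.get_event_loop()
    res = await loop.run_in_executor(None, requests.get, url)
    return res.text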

How to make an async crawl program based on the aiohttp library faster?

I am using asyncio and aiohttp to crawl web images, but when it runs, I find that it does not crawl as fast as I expected.
Is there anything in this code that I can improve?
Inside the for loops I am using many awaits; is that the correct way to deal with this?
async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url,
                               headers=HEADERS,
                               proxy=PROXY_STR,
                               ) as response:
            text = await response.text()
            resp = Selector(text=text)
            nodes = resp.xpath('//div[@class="kl1-2"]')
            for node in nodes:
                next_url = node.xpath('.//div[@class="kl1-2a2"]/a/@href').extract_first()
                title = node.xpath('.//div[@class="kl1-2a2"]/a/@title').extract_first()
                await detail(session=session, next_url=next_url, title=title)
                print('next page')

async def detail(**kwargs):
    session = kwargs['session']
    next_url = kwargs['next_url']
    title = kwargs['title']
    print(next_url)
    print(title)
    async with session.get(
            url=next_url,
            headers=HEADERS,
            proxy=PROXY_STR,
    ) as response:
        text = await response.text()
        resp = Selector(text=text)
        nodes = resp.xpath('//div[@class="kl2-1"]//img/@src').extract()
        nodes = list(set(nodes))
        for img in nodes:
            await download_img(session=session, url=img, title=title)
            print('next image')

async def download_img(**kwargs):
    url = kwargs['url']
    title = kwargs['title']
    try:
        conn = aiohttp.TCPConnector(ssl=False)  # prevent SSL errors
        async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
            async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
                if response.status >= 200 and response.status < 300:
                    f = await aiofiles.open(save_file, 'wb')
                    await f.write(await response.read())
                    await f.close()
    except Exception as e:
        return

async def main():
    total_page = 3640
    for page in range(0, total_page, 35):
        url = START_URL.format(page=page)
        await fetch(url)
        await asyncio.sleep(0)
        print(f'downloading page {page}-')

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
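
The main reason this feels slow is that everything is awaited sequentially: main waits for each page, fetch waits for each detail, and detail waits for each image, so only one request is ever in flight. A hedged sketch of one way to add concurrency with asyncio.gather and a semaphore to cap parallel requests (START_URL, HEADERS, and PROXY_STR are the same undefined module-level names used above; the limit of 10 and the fetch_page helper are illustrative):

import asyncio
import aiohttp

async def fetch_page(session, semaphore, url):
    # The semaphore caps how many requests run at the same time.
    async with semaphore:
        async with session.get(url, headers=HEADERS, proxy=PROXY_STR) as response:
            text = await response.text()
            # parse the listing and schedule the detail/image downloads here,
            # again with create_task/gather rather than awaiting one by one
            print('fetched', url)

async def main():
    total_page = 3640
    semaphore = asyncio.Semaphore(10)  # illustrative concurrency limit
    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.create_task(fetch_page(session, semaphore, START_URL.format(page=page)))
            for page in range(0, total_page, 35)
        ]
        await asyncio.gather(*tasks)

asyncio.run(main())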

Python: Pyppeteer clicking on a pop-up window

I'm trying to accept the cookie consent on a pop-up window that is generated on this page. I tried to use waitForSelector, but none of the selectors that I used seem to be visible to the headless browser. I would like to actually switch to "YES" and then submit the form. I guess it's displayed on window.onload, so perhaps this will need to be done in JavaScript?
import asyncio
import time
from pyppeteer import launch
from pyppeteer.errors import TimeoutError
from urllib.parse import urlparse

URLS = [
    'https://www.trustarc.com/'
]

start = time.time()

async def fetch(url, browser):
    page = await browser.newPage()
    try:
        #await page.setRequestInterception(True)
        page.on('request', callback)
        await page.goto(url, {'waitUntil': 'networkidle0'})
        await page.screenshot({'path': f'img/{urlparse(url)[1]}.png', 'fullPage': True})
    except TimeoutError as e:
        print(f'Timeout for: {url}')
    finally:
        await page.close()

async def callback(req):
    print(f'Request: {req.url}')

async def run():
    browser = await launch(headless=True, args=['--no-sandbox'])
    tasks = []
    for url in URLS:
        task = asyncio.ensure_future(fetch(url, browser))
        tasks.append(task)
    ret = await asyncio.gather(*tasks)
    await browser.close()

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
print(f'It took {time.time()-start} seconds.')
In case someone finds this useful, this is my Python implementation based on the accepted answer:
import asyncio
import time
from pyppeteer import launch
from pyppeteer.errors import TimeoutError
from urllib.parse import urlparse

URLS = [
    'https://www.trustarc.com/'
]

start = time.time()

async def fetch(url, browser):
    page = await browser.newPage()
    try:
        #await page.setRequestInterception(True)
        #page.on('request', callback)
        await page.goto(url, {'waitUntil': 'networkidle0'})
        # Open the Cookie Preferences pop-up (if necessary)
        if not await page.J('.truste_overlay'):
            await page.click('#teconsent > a')
        # Wait for the Cookie Preferences frame and content to load
        cookies_frame = page.frames[1]
        await cookies_frame.waitForSelector('.active', {'visible': True})
        # Fill out and submit the form
        await cookies_frame.evaluate('''() =>
        {
            const yes_buttons = document.getElementsByClassName( 'off' );
            const submit_button = document.getElementsByClassName( 'submit' )[0];
            yes_buttons[0].click();
            yes_buttons[1].click();
            submit_button.click();
        }''')
        # Wait for and close the confirmation message
        close_button = await cookies_frame.waitForSelector('#gwt-debug-close_id')
        await close_button.click()
        await page.screenshot({'path': f'img/{urlparse(url)[1]}.png', 'fullPage': True})
    except TimeoutError as e:
        print(f'Timeout for: {url}')
    finally:
        await page.close()

async def callback(req):
    print(f'Request: {req.url}')

async def run():
    browser = await launch(headless=True, args=['--no-sandbox'])
    tasks = []
    for url in URLS:
        task = asyncio.ensure_future(fetch(url, browser))
        tasks.append(task)
    ret = await asyncio.gather(*tasks)
    await browser.close()

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run())
loop.run_until_complete(future)
print(f'It took {time.time()-start} seconds.')
If the Cookie Preferences pop-up frame does not open up automatically, you can manually open the pop-up by clicking on the button in the bottom-right corner of the web page.
The cookie options are located in an iframe, so you will have to wait until the frame content has loaded before selecting "YES" for Functional Cookies and Advertising Cookies.
After submitting the preferences, you will need to wait for and close the confirmation message to continue using the website.
Full Example:
// Navigate to the website
await page.goto( 'https://www.trustarc.com/', { 'waitUntil' : 'networkidle0' } );

// Open the Cookie Preferences pop-up (if necessary)
if ( await page.$( '.truste_overlay' ) === null )
{
    await page.click( '#teconsent > a' );
}

// Wait for the Cookie Preferences frame and content to load
const cookies_frame = page.frames()[1];
await cookies_frame.waitForSelector( '.active', { 'visible' : true } );

// Fill out and submit form
await cookies_frame.evaluate( () =>
{
    const yes_buttons = document.getElementsByClassName( 'off' );
    const submit_button = document.getElementsByClassName( 'submit' )[0];
    yes_buttons[0].click();
    yes_buttons[1].click();
    submit_button.click();
});

// Wait for and close confirmation
const close_button = await cookies_frame.waitForSelector( '#gwt-debug-close_id' );
await close_button.click();
