I have a working script but when I changed the page.on in playwright it actually runs the network response a certain number of times as per the loop count. I have been trying to figure out why that happens.
For example at i=0 it gives one response.url print but at i=10 it prints response.url 10 times and then send 10 duplicate data to mongodb. I have no idea why this is happening. The link being sent based on the print are all the same.
Would be a great help if anyone can let me know what it is that I am doing wrong that is causing this issue.
Pls see sample code here.
#imports here
today = datetime.today().strftime("%m%d%Y")
filenamearr = []
mongousername = 'XXX'
mongopassword = 'XXXX'
client = MongoClient("mongodb+srv://%s:%s#XXXXX.XXXX.mongodb.net/?retryWrites=true&w=majority"%(mongousername,mongopassword))
db = client.DB1
logg = []
async def runbrowser(playwright,url):
async def handle_response(response,buttonnumber):
l = str(response.url)
para = 'param'
if para in l:
print(response.url)
textdata = await response.text()
subtask = asyncio.create_task(jsonparse(textdata))
done, pending = await asyncio.wait({subtask})
if subtask in done:
print("Success in Json parser")
result = await subtask
status = [buttonnumber,result]
logg.append(status)
print(status)
logdf = pd.DataFrame(logg)
logdf.columns = ['BUTTON','RESULT']
fname = 'XXXX' + today +".csv"
logdf.to_csv(fname,index=False)
async def jsonparse(textdata):
try:
#parsing happens here to output to MongoDB
return "Success"
except Exception as e:
print("Failled parsing")
return e
browser = await playwright.firefox.launch(
headless=True,
)
context = await browser.new_context(
locale='en-US',
ignore_https_errors = True,
)
page = await context.new_page()
await page.goto(url,timeout=0)
button = page.locator("xpath=//button[#event-list-item='']")
bcount = button.locator(":scope",has_text="Locator")
count = await bcount.count()
print(count)
for i in range(count):
print("\n\n\n\n\nSleeping 10 seconds before clicking button")
buttonnumber = i
await asyncio.sleep(10)
print("Clickking Button: ", i)
cbtn = bcount.nth(i)
await cbtn.hover()
await asyncio.sleep(4)
await cbtn.click()
if i==0:
print("i=0")
await page.reload(timeout=0)
retry = page.on("response",lambda response: handle_response(response,buttonnumber))
title = await page.title()
print(title)
print("Heading back to the main page.")
await page.go_back(timeout=0)
await page.reload()
await page.wait_for_timeout(5000)
await page.close()
print("Closing Tab")
await browser.close()
async def main():
tasks = []
async with async_playwright() as playwright:
url = 'https://samplelink.com'
tasks.append(asyncio.create_task(runbrowser(playwright,url)))
for t in asyncio.as_completed(tasks):
print(await t)
await asyncio.gather(*tasks)
asyncio.run(main())
Related
async def get_html(self, url):
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=headers) as resp:
return await resp.text()
async def getrank(self, url):
response = await self.get_html(f'url')
print(f'{response.site} | got site')
soup = BeautifulSoup(response, "html.parser")
rank = soup.find("div", {"id": "productDetails_db_sections"})
test2 = rank.find_all("span")
rank = str(test2[-2].text).replace(",","")
finalRank = int(re.search("\d+", rank).group())
if finalRank < 20000:
print(f'product has low rank, starting new function')
await self.getPriceFinal(url, finalrank)
async def getPriceFinal(self, url, rank):
try:
print(f'Checking for Price....') #THIS PRINTS
s = aiohttp.ClientSession()
response = await s.get(f"{url}", headers = self.headers) #THIS WAITS UNTIL getrank finished
print(response.status)
The main problem I have is that the function getPriceFinal() runs to the print and after that waits for the getrank() function to finish ... however what I would like to do is to start the getPriceFinal() function with the url from getrank() function concurrently .. and ideas on how to solve this issue?
I have such part of code(Python):
def func_login_params():
async with websockets.connect(url) as ws:
# login
timestamp = str(get_local_timestamp())
login_str = login_params(timestamp, api_key, passphrase, secret_key)
await ws.send(login_str)
res = await ws.recv()
# params
sub_str = json.dumps(params)
await ws.send(sub_str)
Upper code works - okey. But I need to have two function: first - login, second - send params. As I thought I could do this:
def func_login():
async with websockets.connect(url) as ws:
# login
timestamp = str(get_local_timestamp())
login_str = login_params(timestamp, api_key, passphrase, secret_key)
await ws.send(login_str)
res = await ws.recv()
def func_params():
async with websockets.connect(url) as ws:
# params
sub_str = json.dumps(params)
await ws.send(sub_str)
But in this situation params don't come. Maybe it don't connect correctly?
When you try to connect websockets, it creates new connection.
So, in that case, you should take an websocket connection as function argument:
async def func_login(ws):
timestamp = str(get_local_timestamp())
login_str = login_params(timestamp, api_key, passphrase, secret_key)
await ws.send(login_str)
res = await ws.recv()
async def func_params(ws):
sub_str = json.dumps(params)
await ws.send(sub_str)
def main():
async with web sockets.connect(url) as ws:
await func_login(ws)
await func_params(ws)
I am trying to scrape the searching result's url by Pyppeteer in my Python program, but it doesn't work...
And here is my code:
import asyncio
from pyppeteer import launch
URL = 'https://hk.appledaily.com/search/apple'
async def main():
browser = await launch()
page = await browser.newPage()
await page.goto(URL)
await page.waitForSelector(".flex-feature")
elements = await page.querySelectorAll('.flex-feature')
for el in elements:
text = await page.evaluate('(el) => el.innerHTML.querySelectorAll("story-card")', el)
print(text)
await browser.close()
asyncio.get_event_loop().run_until_complete(main())
Hope anyone can help! Thanks!
Sorry for stupid question! I have done it just now haha...
import asyncio
from pyppeteer import launch
# https://pypi.org/project/pyppeteer/
URL = 'https://hk.appledaily.com/search/apple'
async def main():
browser = await launch()
page = await browser.newPage()
await page.goto(URL)
await page.waitForSelector(".flex-feature")
elements = await page.querySelectorAll('.flex-feature')
elements = await page.querySelectorAll('.story-card')
for el in elements:
text = await page.evaluate('(el) => el.textContent', el)
text2 = await page.evaluate('(el) => el.href', el)
print(text2)
await browser.close()
asyncio.get_event_loop().run_until_complete(main())
And the result will be:
https://hk.appledaily.com/entertainment/20201227/LJL5DQ64QZHLTHI7LFKHVXB7JM/
https://hk.appledaily.com/sports/20201227/7MQKJNXPQNA6HDXTFUCMWNGUAU/
https://hk.appledaily.com/local/20201227/SWIBOUDSLZB5JBULTIT4DPSEIQ/
https://hk.appledaily.com/entertainment/20201227/TA457F5YYRGQZCNDIR5OFJDLPU/
https://hk.appledaily.com/china/20201227/DY2RQZJVSZHJBDV6XDYBH5G73I/
https://hk.appledaily.com/sports/20201227/4FLJFIHZOFF3JMWPOOSTO5QLCQ/
https://hk.appledaily.com/local/20201227/NIWG4U4LBFGPHLA73RTWHEQCY4/
https://hk.appledaily.com/china/20201227/SUR6Q4UEIVE5HD7OLSCAYIVUUY/
https://hk.appledaily.com/international/20201227/N2P5IPMBKBEGRALQWMDFXJCVGY/
https://hk.appledaily.com/entertainment/20201227/MGG6H2JIJVGODEV3EE7OI6HEGI/
https://hk.appledaily.com/local/20201227/N3TQO3VOBRC3NKT2ILES76CSKY/
https://hk.appledaily.com/international/20201227/GJXFM53DAFAUVOFFZIRKBH3X24/
https://hk.appledaily.com/sports/20201227/2UQC7A4HCBFD5IF7IGJWVK3AOA/
https://hk.appledaily.com/entertainment/20201226/AI7CAJD6O5D5XP7UMZCWSQ5VU4/
https://hk.appledaily.com/entertainment/20201227/3BIOQMUCQVGHXKNP3A4KF7VC6A/
https://hk.appledaily.com/local/20201227/OOYOPLI5WFGJZGAFKGLHSVINPM/
https://hk.appledaily.com/local/20201227/6FXZ5FKNMVHS5JTTO6YWO55JZY/
https://hk.appledaily.com/local/20201227/VQTZMOKCUZGMFL4PYBZ5YZYOSQ/
https://hk.appledaily.com/international/20201227/4VPFDXJFKZH5ZFRXSKZW3OASAA/
https://hk.appledaily.com/entertainment/20201227/TCVCDXKK4JHE7HHEJ7U6MFSS5U/
https://hk.appledaily.com/local/20201227/NIWG4U4LBFGPHLA73RTWHEQCY4/
https://hk.appledaily.com/entertainment/20201227/GY4WJIFLPREKJHGJ2VQO7LDZAU/
https://hk.appledaily.com/entertainment/20201227/3BIOQMUCQVGHXKNP3A4KF7VC6A/
https://hk.appledaily.com/local/20201227/OOYOPLI5WFGJZGAFKGLHSVINPM/
https://hk.appledaily.com/local/20201227/N3TQO3VOBRC3NKT2ILES76CSKY/
https://hk.appledaily.com/local/20201227/Z4CRG7TLUJFMLO3JIY2KWBTL5A/
https://hk.appledaily.com/local/20201227/353WEBFTBZFHBCP2O4IXIARBEM/
Process finished with exit code 0
I am using aysnc and aiohttp to crawl web image, but when it was running, I found it was not crawling as fast as I expected.
Is there any code that I can improve there?
In the for loop I am using many await inside, is that the correct way to deal with that?
async def fetch(url):
async with aiohttp.ClientSession() as session:
async with session.get(url=url,
headers=HEADERS,
proxy=PROXY_STR,
) as response:
text = await response.text()
resp = Selector(text=text)
nodes = resp.xpath('//div[#class="kl1-2"]')
for node in nodes:
next_url = node.xpath('.//div[#class="kl1-2a2"]/a/#href').extract_first()
title = node.xpath('.//div[#class="kl1-2a2"]/a/#title').extract_first()
await detail(session=session, next_url=next_url, title=title)
print('next page')
async def detail(**kwargs):
session = kwargs['session']
next_url = kwargs['next_url']
title = kwargs['title']
print(next_url)
print(title)
async with session.get(
url=next_url,
headers=HEADERS,
proxy=PROXY_STR,
) as response:
text = await response.text()
resp = Selector(text=text)
nodes = resp.xpath('//div[#class="kl2-1"]//img/#src').extract()
nodes = list(set(nodes))
for img in nodes:
await download_img(session=session,url=img,title=title)
print('next image')
async def download_img(**kwargs):
url= kwargs['url']
title= kwargs['title']
try:
conn = aiohttp.TCPConnector(ssl=False) # 防止ssl报错
async with aiohttp.ClientSession(connector=conn, trust_env=True) as session:
async with session.get(url=url, headers=SIMPLE_HEADERS, proxy=PROXY_STR) as response:
if response.status>=200 and response.status<300:
f=await aiofiles.open(save_file,'wb')
await f.write(await response.read())
await f.close()
except Exception as e:
return
async def main():
total_page = 3640
for page in range(0,total_page,35):
url = START_URL.format(page=page)
await fetch(url)
await asyncio.sleep(0)
print(f'downing page {page}-')
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
I'm working on a python client that will asynchronously download vinyl cover art. My problem is that I'm new to python (especially asynchronous python) and I don't think my code is running ansychronously. I have another client written in Node.js that is able to get approx. 40 images/sec whereas this python one is only managing to get around 1.5/sec.
import aiohttp
import asyncio
from os import path,makedirs
caa_base_url = "https://coverartarchive.org/release"
image_download_dir = path.realpath('images')
# small,large, None = Max
image_size = None
async def getImageUrls(release_mbid,session):
async with session.get(f'{caa_base_url}/{release_mbid}') as resp:
if resp.status == 404 or resp.status == 403:
return
return [release_mbid,await resp.json()]
async def getImage(url,session):
try:
async with session.get(url) as resp:
return [url,await resp.read()]
except (aiohttp.ServerDisconnectedError):
return await getImage(url,session)
async def getMBIDs(mb_page_url):
async with aiohttp.ClientSession() as session:
async with session.get(mb_page_url) as resp:
mb_json = await resp.json()
tasks = []
async with aiohttp.ClientSession() as caa_session:
for release in mb_json["releases"]:
task = asyncio.ensure_future(getImageUrls(release["id"],caa_session))
tasks.append(task)
responses = await asyncio.gather(*tasks)
async with aiohttp.ClientSession() as caa_image_session:
for response in responses:
if response is not None:
caaTasks = []
release_mbid = response[0]
result = response[1]
for image in result["images"]:
if image["front"] == True:
caaTask = asyncio.ensure_future(getImage(image["image"],caa_session))
caaTasks.append(caaTask)
image_responses = await asyncio.gather(*caaTasks)
for image_response in image_responses:
image_url = image_response[0]
image_binary = image_response[1]
new_file_dir = path.join(image_download_dir,release_mbid)
if not path.isdir(new_file_dir):
makedirs(new_file_dir)
file_name = image_url[image_url.rfind("/")+1:]
file_path = path.join(new_file_dir,file_name)
new_file = open(file_path,'wb')
new_file.write(image_binary)
mb_base_url = "https://musicbrainz.org/ws/2/release"
num_pages = 100
releases_per_page = 100
mb_page_urls = []
async def getMBPages():
for page_index in range(num_pages):
await getMBIDs('%s?query=*&type=album&format=Vinyl&limit=%s&offset=%s&fmt=json' % (mb_base_url,releases_per_page,page_index*releases_per_page))
await asyncio.sleep(1)
loop = asyncio.get_event_loop()
loop.run_until_complete(getMBPages())
P.S. The sleep is because musicbrainz api limits to 1 request/sec