I'm trying to measure the time it takes a page to fully load (like the "Finish" time in Google Chrome's DevTools). I wrote something in Python, but I get really low results, way less than a second, which is not realistic. This is what I have:
from urllib.request import urlopen
from time import time
class Webpage:
    def __init__(self, pageName, pageUrl):
        self.pageName = pageName
        self.pageUrl = pageUrl

class LoadingDetail:
    def __init__(self, webPage, loadingTimes):  # The webpage object and its loading times
        self.webPage = webPage
        self.loadingTimes = loadingTimes

pages = [
    Webpage("test", "URL"),
    Webpage("test2", "URL"),
    Webpage("test3", "URL"),
    Webpage("test4", "URL"),
]

loadingDetails = []
for page in pages:  # Go through each page in the list.
    pageLoadTimes = []  # The times it took this page to load.
    for x in range(0, 3):  # Number of times we request each page.
        stream = urlopen(page.pageUrl)
        startTime = time()
        streamRead = stream.read()
        endTime = time()
        stream.close()
        timeToLoad = endTime - startTime  # The time it took to read the whole page.
        pageLoadTimes.append(timeToLoad)
    loadDetails = LoadingDetail(page, pageLoadTimes)
    loadingDetails.append(loadDetails)
I get results like 0.00011. I searched but only found Selenium-based answers, which I can't use.
Is there a way to do this with Python only? Is Python even the right tool for this? I saw answers using JS that seem to be exactly what I'm looking for.
I tried it and was able to measure the time with the code below; this might help.
Reference: geeksforgeeks.org/timeit-python-examples/
import timeit
mysetup = "from urllib.request import urlopen"
mycode = '''
urlopen('http://www.python.org')
'''
print(timeit.timeit(setup = mysetup, stmt = mycode, number = 1))
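Note that the timeit snippet above only times the urlopen() call itself, not reading the response body, and neither approach captures sub-resources (images, CSS, JS) the way Chrome's "Finish" metric does. If the goal is the total time to fetch the full HTML of a page, a minimal sketch with perf_counter (placeholder URL) could look like this:

from time import perf_counter
from urllib.request import urlopen

url = "http://www.python.org"  # placeholder URL

start = perf_counter()          # start timing before the connection is opened
with urlopen(url) as response:  # connection setup is included in the measurement
    body = response.read()      # reading the whole body is included as well
elapsed = perf_counter() - start

print(f"Fetched {len(body)} bytes in {elapsed:.3f} s")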
I am writing this Python script to scrape a website and collect information. After I enter the date and the number of games on that date, the script goes to the site, scrapes a specific table, and saves each game into a CSV file. The table has 12 rows, which is why I hardcoded that in the for loop.
The script works, but I would like suggestions from you experts on how to optimize and speed it up. I thought using concurrent.futures would speed it up, but it doesn't give an obvious improvement.
It would be great if anyone could help. At the moment, the script takes 20-30 seconds to complete one date's scrape.
Thank you very much for your time!
import concurrent.futures
import pandas as pd
from requests_html import HTMLSession
import requests_cache

session = HTMLSession()
requests_cache.install_cache(expire_after=3600)

game_date = input("Please input the date of the game that you want to scrape (in YYYY/MM/DD): ")
game_no = int(input("Please input how many games there are on that date: "))

def split_list(big_list, chunk_size):
    return [big_list[i:i + chunk_size] for i in range(0, len(big_list), chunk_size)]

def get_game_result(game):
    print(f"Processing game {game}")
    url = f"https://example.com{game_date}&{game}"  # example link
    response = session.get(url)
    response.html.render(sleep=5, keep_page=True, scrolldown=1)
    row_body = response.html.xpath("/html/body/div[1]/div[3]/div[2]/div[2]/div[2]/div[5]/table/tbody/tr[1]")
    final_list = []
    for i in range(2, 14):
        for item in row_body:
            item_table = item.text.split("\n")
            final_list.append(item_table)
        row_body = response.html.xpath(f"/html/body/div[1]/div[3]/div[2]/div[2]/div[2]/div[5]/table/tbody/tr[{i}]")
    df = pd.DataFrame(final_list)
    df.to_csv(f"game_{game}.csv", index=False, header=False)
    print(f"Finished processing game {game}")

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = [executor.submit(get_game_result, game) for game in range(1, game_no + 1)]
    for f in concurrent.futures.as_completed(results):
        f.result()
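One thing that might be worth trying (a sketch only, not tested against the real site, since the URL above is just an example): grab every table row with a single XPath query instead of issuing one query per row, and consider lowering the sleep passed to render(), since that fixed 5-second wait is paid once per page and likely dominates the 20-30 seconds you are seeing. Also note that whether rendering in several threads truly runs in parallel depends on requests_html/pyppeteer internals, so the thread pool may help less than expected.

def get_game_result(game):
    print(f"Processing game {game}")
    url = f"https://example.com{game_date}&{game}"  # example link, same placeholder as above
    response = session.get(url)
    # Shorter fixed wait; how low you can go depends on how fast the page renders.
    response.html.render(sleep=2, keep_page=True, scrolldown=1)
    # One XPath query that returns all rows of the table at once.
    rows = response.html.xpath(
        "/html/body/div[1]/div[3]/div[2]/div[2]/div[2]/div[5]/table/tbody/tr"
    )
    final_list = [row.text.split("\n") for row in rows[:12]]  # keep the first 12 rows
    pd.DataFrame(final_list).to_csv(f"game_{game}.csv", index=False, header=False)
    print(f"Finished processing game {game}")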
I have made a web-scraping script in Python. The job is to go through many sofascore.com pages to gather information, using BeautifulSoup and Playwright.
However, when the loop runs through all my Sofascore pages, two situations occur: the first type of page lets me gather all the information, and the second type does not. I have inspected both types of pages and they have the same elements. My code is:
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import pandas as pd
import time

HomeGoal = []
AwayGoal = []
HomeTeam = []
AwayTeam = []

with sync_playwright() as p:
    # headless=False, slow_mo=50
    browser = p.chromium.launch(headless=False, slow_mo=50)
    page = browser.new_page()
    page.goto(THEPAGES)
    time.sleep(1)
    page.is_visible('//div[contains(@class, "sc-18688171-0 sc-7d450bff-4 fXAhuT fBSHnS")]')
    HTML = page.inner_html('//div[contains(@class, "sc-cd4cfbdc-0 hDkGff")]')
    Soup = BeautifulSoup(HTML, 'html.parser')
    NotFirst = 0
    for I in Soup:
        if len(I.text) > 0 and NotFirst != 1:
            NotFirst = NotFirst + 1
            Home = I.text.rsplit(" - ", 1)[0]
            Away = I.text.rsplit(" - ", 1)[1]

            # This below will gather information about the matches
            HTML = page.inner_html('//div[contains(@class, "sc-4b793701-0 dTwLyM u-overflow-hidden")]')
            Soup = BeautifulSoup(HTML, 'html.parser')

            # The information for previous matches
            for I in Soup.find_all(class_="sc-c2090177-0 dLUwVT"):
                print(I.text)
            # Information is gathered
This code is working fine with pages such as:
https://www.sofascore.com/ymir-kopavogur-knattspyrnufelag-rangaeinga/EvvsEIO
But it is not working on pages like https://www.sofascore.com/gimnasia-y-esgrima-csyd-liniers/fobsQgCb
I used the code below to test whether all information is gathered. It tells me that everything is gathered on the first page, but not on the second:
HTML = page.inner_html('//div[contains(@class, "sc-4b793701-0 dTwLyM u-overflow-hidden")]')
Soup = BeautifulSoup(HTML, 'html.parser')
print(Soup)
As far as I can tell, this should work fine, and I cannot find this problem described anywhere else, since the elements exist on both pages.
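One possible cause (an assumption, since the failing page can't be reproduced here): page.is_visible() returns immediately and does not wait for the element to appear, so on a slower page the inner_html() calls may run before the content has rendered. A minimal sketch that waits explicitly, reusing the same selector as above:

# Wait (up to 10 seconds) for the container to actually appear before reading its HTML.
page.wait_for_selector('//div[contains(@class, "sc-cd4cfbdc-0 hDkGff")]', timeout=10000)
HTML = page.inner_html('//div[contains(@class, "sc-cd4cfbdc-0 hDkGff")]')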
I recently updated the Selenium web driver for one of our old apps because it wasn't working.
Now the driver seems functional, but the script doesn't fully do what it is supposed to. Please help; perhaps there is an error in the logic inside the main function, but as mentioned, this used to work before.
Essentially, I have a switcher.py app that cycles through three browser tabs with a specified delay. I list the URLs in urls.txt, where two entries are ordinary links and the third is file://srv13nas1/departmental/3_Operations/350_Assembly/department/projects/PLL01_Ramp-up/08_Visualization/Presentations/ which contains all of the user PNGs. switcher.py is supposed to pick a presentation PNG based on the serial number for the work center, which we get from one of two CSV files. If no serial number is found, it should default to the default image with our company's logo; see the main function for where that is supposed to happen.
For some reason, this last step of defaulting to the image has stopped working and I am not sure what to fix. The behavior is strange: it just shows "File not found". I know the file with that name is not in the specified directory, but it should have fallen back to the default image in that case.
Here is the source code for switcher.py:
from os.path import join, dirname
from datetime import datetime, timedelta
from selenium import webdriver
from time import sleep
import pandas as pd
import sys

ROOT = dirname(__file__)
PATH_P = join(r"\\srvditz1\DataXchange_R3\TUS_EXCHANGE", "ZPP00141_TUS350.csv")  # join("data", "zpp00141_all.csv")
PATH_E = join(r"\\srvditz1\DataXchange_R3\TUS_EXCHANGE", "ZPP00138_TUS350.csv")  # join("data", "sample_data1.csv")
DEFAULT_IMAGE = r'file://srv13nas1/departmental/3_Operations/350_Assembly/department/projects/PLL01_Ramp-up/08_Visualization/Presentations/Push_Slide_1.png'
DRIVER_PATH = join(ROOT, "resources", "msedgedriver.exe")
URLS_FILE = sys.argv[1] if len(sys.argv) == 2 else "urls.txt"
URLS_PATH = join(ROOT, URLS_FILE)
URLS = [x.strip() for x in open(URLS_PATH).readlines() if x.strip() != ""]
IS_LOOP = True
# Specify the delay value in seconds
DELAY = 20

def fix_urls():
    wc = URLS[0].split('=')[-1]
    # Read dfe (executed)
    dfe = pd.read_csv(PATH_E, sep=";")
    # Filter on final confirmation
    filt = (dfe['Final Confirmation'] == 'X')
    dfe = dfe.loc[filt]
    # Read dfp (planned)
    dfp = pd.read_csv(PATH_P, sep=";")
    # Try to find the wc in the planned dataset; if not found, try the current dataset
    try:
        x = dfp[dfp["Work Center"] == wc]["Serial Number"].values[0]
    except:
        x = dfe[dfe["Work Ctr"] == wc]["Goods recipient"].values[0]
    URLS[-1] = URLS[-1] + x + ".png"

def g_driver():
    # Create a selenium webdriver
    driver = webdriver.Edge(DRIVER_PATH)
    # For each url, create an empty tab
    for i in range(len(URLS) - 1):
        driver.execute_script("window.open('');")
    return driver

def wait(n):
    # Waits until the n-th second of a minute
    # Get the previous refresh time
    last_time = datetime.now()
    last_time = last_time.replace(second=last_time.second - last_time.second % n)
    future_time = last_time + timedelta(seconds=n)
    diff = future_time - datetime.now()
    sleep(diff.total_seconds())

def switch(driver):
    # Keep running
    while IS_LOOP:
        # For each url
        for i in range(len(URLS)):
            # Switch to the tab
            driver.switch_to.window(driver.window_handles[i])
            # Refresh the url
            driver.get(URLS[i])
            # Stay on the page for a while
            wait(DELAY)
            driver.delete_all_cookies()

def main():
    # Get the png path
    try:
        fix_urls()
    except:
        URLS[-1] = DEFAULT_IMAGE
    # Create the browser instance
    driver = g_driver()
    # Start switching tabs
    switch(driver)

if __name__ == "__main__":
    main()
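If I read the description above correctly, the fallback in main() only triggers when fix_urls() raises, not when a serial number is found but the resulting PNG simply does not exist on the share, which would explain the "File not found" page. A hedged sketch of how main() could also check that the file exists before using it; the file://-to-UNC conversion below is an assumption and may need adjusting for your share:

import os

def main():
    # Build the png path; fall back to the default image if anything goes wrong
    try:
        fix_urls()
        # fix_urls() succeeded, but the PNG it points to may still be missing.
        # Assumed conversion: 'file://server/share/...' -> r'\\server\share\...'
        local_path = URLS[-1].replace("file://", r"\\").replace("/", "\\")
        if not os.path.exists(local_path):
            URLS[-1] = DEFAULT_IMAGE
    except:
        URLS[-1] = DEFAULT_IMAGE
    # Create the browser instance and start switching tabs
    driver = g_driver()
    switch(driver)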
I need to write a loop so that the parser collects data from all pages, but my version does not work. How could I implement it differently?
import time
import pandas as pd
from selenium.webdriver import Chrome
from datetime import datetime

webdriver = r"C:\Users\К.Бояр (Второй)\source\repos\RozetaParcer\chromedriver.exe"
driver = Chrome(webdriver)
driver.implicitly_wait(10)
driver.get("https://rozetka.com.ua/search/?producer=gazer&seller=rozetka&text=Gazer")

total = []
items = driver.find_elements_by_css_selector(".goods-tile.ng-star-inserted")
cur_date = datetime.now().strftime("%d_%m_%Y")

for item in items:
    t_name = item.find_element_by_css_selector('.goods-tile__title').text
    t_price = item.find_element_by_css_selector('.goods-tile__price-value').text
    t_nal = item.find_element_by_css_selector('.goods-tile__availability').text
    row = cur_date, t_name, t_price, t_nal
    total.append(row)

driver.close()
df = pd.DataFrame(total, columns=['Date', 'Name', 'Price', 'Nal'])
df.to_csv(f'Rozetka_parcer_{cur_date}.csv')
You have to get the button with .pagination__direction_type_forward in a while loop until the button becomes disabled and gray (which means you are on the last page). Inside that while loop, collect the items before you click the next-page button.
There are many ways to approach this, but in my opinion this is the easiest (and this problem is different for every website, because they differ in the tech they use and the HTML they have). A rough sketch is below.
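A rough sketch along those lines, reusing the same old-style Selenium calls as the question; the exact "last page" check is an assumption and depends on Rozetka's markup:

import time

total = []
while True:
    # Collect the items on the current page first
    for item in driver.find_elements_by_css_selector(".goods-tile.ng-star-inserted"):
        t_name = item.find_element_by_css_selector('.goods-tile__title').text
        t_price = item.find_element_by_css_selector('.goods-tile__price-value').text
        t_nal = item.find_element_by_css_selector('.goods-tile__availability').text
        total.append((cur_date, t_name, t_price, t_nal))

    # Then look for the "next page" button
    buttons = driver.find_elements_by_css_selector(".pagination__direction_type_forward")
    # Assumed check: no button, or a disabled/grayed-out one, means this is the last page
    if not buttons or not buttons[0].is_enabled() or "disabled" in (buttons[0].get_attribute("class") or ""):
        break
    buttons[0].click()
    time.sleep(3)  # crude wait for the next page to load; adjust as needed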
I need to get (1) a list of the video links in a playlist and (2) a list of the video titles in that playlist. This is what I am doing:
from pytube import YouTube, Playlist

playlist_link = "https://www.youtube.com/playlist?list=PLJKfZ_cKGyLdYqdzGLCJPbsi9UGCcEc5e"
video_links = Playlist(playlist_link).video_urls

video_titles = []
for link in video_links:
    video_titles.append(YouTube(link).title)
While this works, getting all the titles takes forever because each link has to be converted into a YouTube object. Is there a faster way to do this?
I looked at your code and it takes around 22 seconds to execute.
from time import time
from pytube import YouTube, Playlist

playlist_link = "https://www.youtube.com/playlist?list=PLJKfZ_cKGyLdYqdzGLCJPbsi9UGCcEc5e"
video_links = Playlist(playlist_link).video_urls

video_titles = []
start = time()
for link in video_links:
    video_titles.append(YouTube(link).title)
print(f'Time taken: {time() - start}')

# output
Time taken: 21.815414667129517
You can reduce the execution time by adding multithreading. The following code takes about 3 seconds to execute.
from time import time
from pytube import YouTube, Playlist
from concurrent.futures import ThreadPoolExecutor, as_completed

playlist_link = "https://www.youtube.com/playlist?list=PLJKfZ_cKGyLdYqdzGLCJPbsi9UGCcEc5e"
video_links = Playlist(playlist_link).video_urls

start = time()

def get_video_title(link):
    title = YouTube(link).title
    return title

processes = []
with ThreadPoolExecutor(max_workers=10) as executor:
    for url in video_links:
        processes.append(executor.submit(get_video_title, url))

video_titles = []
for task in as_completed(processes):
    video_titles.append(task.result())
    print(task.result())

print(f'Time taken: {time() - start}')

# output
Time taken: 2.7463150024414062