I'm very new to Python, but I've made a lot of progress over the last few days. The script below works fine, but I just can't figure out how to implement code that prints an incremented number every time 'avail' is equal to NO. I'd like it to print something like 'None Available 1' on the first loop, then 'None Available 2' on the second loop, then 'None Available 3' on the third loop, etc.
import requests
import time
import subprocess
from bs4 import BeautifulSoup
def get_page(url):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    When the server answers with a non-OK status, the status code is printed
    and ``None`` is returned, so callers must be ready for a missing page.
    """
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
        return None
    return BeautifulSoup(response.text, 'lxml')
def get_detail_data(soup):
    """Read the availability flag from a parsed page.

    Looks up the ``<span id="availability">`` element and interprets its
    stripped text.

    Returns:
        True  -- the text is "YES".
        False -- the text is "NO" (also prints 'None Available').
        None  -- no page was supplied, the span is missing, or the text is
                 anything else (the last two print "Unexpected value").
    """
    if soup is None:
        # The page could not be fetched (get_page returns None in that case);
        # there is nothing to inspect.
        return None

    span = soup.find('span', id='availability')
    if span is None:
        print("Unexpected value")
        return None

    avail = span.text.strip()
    if avail == "YES":
        return True
    if avail == "NO":
        print('None Available')
        return False
    print("Unexpected value")
    return None
def main():
    """Poll the page every two seconds until the item becomes available.

    As soon as ``get_detail_data`` reports availability, the batch file is
    launched once and the loop ends.
    """
    url = 'https://www.blahblah.com'
    while True:
        is_available = get_detail_data(get_page(url))
        if is_available:
            subprocess.call(["C:\\temp\\filename.bat"], shell=False)
            break
        # Not available (or unknown): wait before asking again.
        time.sleep(2)


if __name__ == '__main__':
    main()
The following would probably work, but there might be a better way to structure it.
# Number of times the page has reported "NO" so far.
_not_avail_counter = 0


def get_detail_data(soup):
    """Read the availability flag and count consecutive "NO" answers.

    Returns True for "YES", False for "NO" (printing 'None Available <n>'
    where <n> is the running count), and None for any other text (printing
    "Unexpected value").
    """
    # Without this declaration the `+= 1` below makes the name local to the
    # function and raises UnboundLocalError on the first "NO".
    global _not_avail_counter

    avail = soup.find('span', id='availability').text.strip()
    if avail == "YES":
        return True
    elif avail == "NO":
        _not_avail_counter += 1
        print('None Available ' + str(_not_avail_counter))
        return False
    else:
        print("Unexpected value")
        return None
I would suggest changing your while True loop into a for loop on an itertools.count iterator. You can pass the value from the count to the get_detail_data function with an argument.
import itertools
def get_detail_data(soup, count):  # take the count as an argument
    """Read the availability flag, labelling "NO" messages with *count*.

    Returns True for "YES", False for "NO" (printing 'None Available <count>'),
    and None for any other text (printing "Unexpected value").
    """
    avail = soup.find('span', id='availability').text.strip()
    if avail == "YES":
        return True
    elif avail == "NO":
        print('None Available', count)  # include count here (and anywhere else you want)
        return False
    else:
        print("Unexpected value")
        return None
def main():
    """Poll the page every two seconds, passing a running loop count along.

    The count produced by ``itertools.count()`` is handed to
    ``get_detail_data`` so it can be included in the printed message.
    """
    url = 'https://www.blahblah.com'
    for c in itertools.count():  # produce the count in a loop
        is_available = get_detail_data(get_page(url), c)
        if is_available:
            subprocess.call(["C:\\temp\\filename.bat"], shell=False)
            break
        time.sleep(2)
Note that itertools.count starts counting at zero. If you want to start at 1 (like a human usually would when counting things), you may want to pass 1 as the start argument: for c in itertools.count(1).
Related
I'm making a wikipedia crawler but it's very slow. How can I make it faster?
I'm using requests module and beautifulsoup4 to parse the html pages. I've tried implementing multithreading but it's still slow.
import requests
from bs4 import BeautifulSoup as bs
from queue import Queue
# Site root; every crawled path is appended to this.
baseURL = "https://en.wikipedia.org"
# Article path where the crawl begins.
startURL = "/wiki/French_battleship_Courbet_(1911)"
# Article path whose discovery ends the crawl.
endURL = "/wiki/Royal_Navy"

# FIFO queue of article paths still waiting to be crawled.
tovisit = Queue()
# Article paths that have already been crawled, in crawl order.
visited = list()
def main():
    """Validate the configured start/end paths, then start crawling."""
    if not (checkValid(startURL) and checkValid(endURL)):
        print("Invalid URLs entered.")
        # quit() is a helper meant for the interactive shell; raise
        # SystemExit directly and report failure through the exit status.
        raise SystemExit(1)
    initCrawler(startURL)
def initCrawler(startURL):
    """Crawl breadth-first from *startURL* until the queue runs dry.

    Each path taken from ``tovisit`` is crawled with ``linkCrawl``; the links
    it returns are queued, and the path is then recorded in ``visited``.
    """
    # No `global` needed: the module-level containers are mutated, not rebound.
    tovisit.put(startURL)
    while not tovisit.empty():
        url = tovisit.get()
        for link in linkCrawl(url):
            tovisit.put(link)
        visited.append(url)
def linkCrawl(url):
    """Fetch one article and return the new, valid article links it contains.

    Args:
        url: Site-relative article path (appended to ``baseURL``).

    Returns:
        A list of link paths that pass ``checkValid`` and are neither visited,
        queued, nor already collected from this page.

    Raises:
        SystemExit: when ``endURL`` is found among the links (after printing
            "yay").
    """
    print("crawling " + url + "\n")
    r = requests.get(baseURL + url)
    soup = bs(r.content, "html.parser")

    # Build the "already known" set once per page. The previous code copied
    # the whole queue into a list and scanned it (plus the visited list) for
    # every single link, which dominated the running time as the queue grew.
    known = set(visited)
    known.update(tovisit.queue)
    known.add(url)

    refinedlinks = []
    for anchor in soup.find_all('a', href=True):
        href = anchor["href"]
        # ensure what we have is a string
        if not isinstance(href, str):
            continue
        if href in known or not checkValid(href):
            continue
        if href == endURL:
            print("yay")
            raise SystemExit(0)
        # Remember it so a link repeated on this page is only queued once.
        known.add(href)
        refinedlinks.append(href)
    return refinedlinks
def checkValid(url):
    """Return True when *url* is a site-relative path to a regular article.

    Rejected: anything not under "/wiki/", the Special:, Wikipedia:, Portal:
    and File: namespaces, and pages ending in "(disambiguation)".
    """
    if not url.startswith("/wiki/"):
        return False
    excluded_prefixes = (
        "/wiki/Special:",
        "/wiki/Wikipedia:",
        "/wiki/Portal:",
        "/wiki/File:",
    )
    # str.startswith accepts a tuple, so one call covers every namespace.
    if url.startswith(excluded_prefixes):
        return False
    return not url.endswith("(disambiguation)")
# Start the crawl only when the file is executed as a script.
if __name__ == "__main__":
    main()
I expect the bot to run faster, but it's actually slow. Research says that eventually multithreading won't be enough.
import requests
from bs4 import BeautifulSoup
import time
import sys
# Download the page and parse it.
url = "https://www.doviz.com/"
response = requests.get(url)
html_icerigi = response.content
soup = BeautifulSoup(html_icerigi, "html.parser")

# The spans with class "menu-row1" / "menu-row2" are paired up below as
# name / value.
isimler = soup.find_all("span", {"class": "menu-row1"})
degerler = soup.find_all("span", {"class": "menu-row2"})

islem = input("Lütfen işleminizi giriniz...")
time.sleep(1)

for isim, deger in zip(isimler, degerler):
    isim = isim.text
    deger = deger.text
    isim = isim.strip()
    deger = deger.strip()
    isim = isim.replace("\n", "")
    deger = deger.replace("\n", "")
    print(isim, deger)
    # NOTE: `islem` never changes inside this loop and there is no `break`,
    # so once entered it repeats forever for the first name/value pair.
    while True:
        if islem == "q" or islem == "Q":
            print("Programdan çıkılıyor...")
            time.sleep(1)
        elif islem == "1":
            # NOTE: `deger` is a string here, so deger[0] is only its first
            # character.
            print("1 Gr Altın = {} TLdir.".format(deger[0]))
My questions are:
When I run this code, because of the while loop inside the for loop, it gives output like:
1 Gr Altın = 2 TLdir.
1 Gr Altın = 2 TLdir.
continuously. How can I avoid that?
deger[0] returns just the first digit of the real value of a gram of gold. I want all of it, like 216,370 USD.
By the way, I'm an absolute beginner to Python.
Best Regards.
I see no need for the while True: loop. Dropping that will get rid of problem #1
Similarly, it looks like you are indexing the deger string. The 0 index of a string is its first letter. Dropping the indexing will return the entire value.
EDIT:
From your comments, it looks like you are trying to make a list that you can index to pull up the specific value of that index. In order to do this, you need to make a list of the values.
# One "name<TAB>value" entry per scraped pair, addressable by position.
look_up = []
for isim, deger in zip(isimler, degerler):
    isim = isim.text.strip().replace("\n", "")
    deger = deger.text.strip().replace("\n", "")
    look_up.append(isim + "\t" + deger)
    print(isim, deger)

# dedent to remove from `for` loop
if islem == "q" or islem == "Q":
    print("Programdan çıkılıyor...")
    time.sleep(1)
else:
    try:
        secim = look_up[int(islem)]
    except (ValueError, IndexError):
        # The input was not a number, or no entry exists at that position.
        print("Geçersiz işlem: {}".format(islem))
    else:
        print("1 Gr Altın = {} TLdir.".format(secim))
Note: this will only allow one lookup. If you want it to allow for more than one lookup, you will need to institute a sentinel loop whereby the loop continues until "q" is pressed.
Additionally, you may be better served by a dictionary: create look_up = {} and populate it with look_up[isim] = deger. That way you can look up the info by the isim value, e.g. DOLAR or EURO.
I hope this is enough information to get you where you are trying to go. I trust you can use the info given here to solve your specific use case.
What I want to do is build a sort of monitor for a website that picks a random number. Before it does that, it needs to send a request to the website to check whether it is up. While the site is live, it generates random numbers from 1 to 100; I want the script to check every 3-6 seconds (chosen at random), print the number again, and repeat until the website goes down.
What I have tried to do is following:
def get_product_feed(url_site):
    """Block until *url_site* is live, then return its parsed HTML.

    The site counts as down while the final URL of the response contains
    'deadsite'. While it is down the request is repeated after a random
    3-7 second pause; after a failed request the pause is 1-2 seconds.

    Returns:
        The response body parsed with ``soup(..., 'lxml')``.
    """
    while True:
        try:
            response = requests.get(url_site, timeout=12)  # Is the site alive?
        except requests.RequestException:
            # Network trouble: back off briefly and try again.
            time.sleep(random.randint(1, 2))
            continue

        if 'deadsite' in response.url:
            # Still down: wait a little longer before the next check.
            time.sleep(random.randint(3, 7))
            continue

        # Live. The earlier version parsed `contcheck` here, which is unbound
        # when the very first request succeeds; the resulting NameError was
        # swallowed by a blanket `except Exception`, so it retried forever.
        return soup(response.text, 'lxml')
def get_info(thread):
    """Fetch the JSON document at *thread* and return its 'number' entry.

    A failed request, an HTTP error status, or a body that is not valid JSON
    is retried after a random 1-3 second pause.

    Raises:
        KeyError: if the decoded JSON has no 'number' key.
    """
    while True:
        try:
            resp = requests.get(thread, timeout=12)  # Request the website
            resp.raise_for_status()
            json_resp = resp.json()  # Decode the JSON body
        except (requests.RequestException, ValueError):
            # ValueError covers a body that cannot be decoded as JSON.
            time.sleep(random.randint(1, 3))
            continue
        return json_resp['number']
def get_identifier(thread):
    """Return the number reported for *thread*, formatted as a string.

    When ``get_info`` raises KeyError, the result is the string 'None'.
    """
    # The lookup that can raise KeyError happens inside get_info, so the call
    # itself must sit inside the try block.
    try:
        thread_number = get_info(thread)
    except KeyError:
        thread_number = None
    return '{}'.format(thread_number)
def script():
    """Print every identifier that has not been printed before, forever."""
    url_site = 'https://www.randomsitenumbergenerator.com/'  # What url we gonna use
    old_list = []
    while True:
        # NOTE(review): this iterates over whatever get_product_feed returns
        # and hands each item to get_identifier as if it were a URL --
        # confirm that this is the intended input.
        for thread in get_product_feed(url_site):
            # Ask once per item: every call triggers a new request and could
            # return a different value.
            identifier = get_identifier(thread)
            if identifier not in old_list:
                print(identifier)
                old_list.append(identifier)
I added a comment to make it easier to understand what is going on.
The issue I am having now is that I cannot get get_identifier(thread) to keep running until the website goes down. I want it to keep printing for as long as the website is live, until it dies, and that is my question! What do I need to do to make that happen?
My thought was to eventually add threads, maybe 10 threads checking at the same time whether the website is dead and printing the value back, but I am not sure whether that is the solution to my question.
I'm passing a link as an argument to a thread, and I want to scrape the timestamp from it. But in the function that the thread targets, the timestamp value does not change each time I re-scrape it. How do you get timeLink to be dynamic and change every time it goes through the while loop? Here is the code:
def abcStart(timeLink):
    """Compare the first <b> text of a response with ``otherTimestamp``.

    ``timeLink`` is used as a response object here. ``otherTimestamp`` is
    read from the enclosing scope and is not defined in this snippet.
    """
    while True:
        # NOTE: the same response object is reused on every pass, so the
        # scraped timestamp cannot change between iterations.
        res = timeLink
        res.raise_for_status()
        timestamp = BeautifulSoup(res.content, 'html.parser').find_all('b')
        if timestamp[0].text == otherTimestamp[0].text:
            # work on something
            break
        if timestamp[0].text > otherTimestamp[0].text:
            continue
        else:
            print('not yet')
            time.sleep(30)
            break
timelink = requests.get('http://example.com/somelink')
# `args` must be a tuple: `(timelink)` is just the response itself, whereas
# `(timelink,)` is a one-element tuple holding it.
threadobj = threading.Thread(target=abcStart, args=(timelink,))
threadobj.start()
threadobj.join()
It looks like there is only one http request being sent. On this line:
timelink = requests.get('http://example.com/somelink')
the abcStart() function is receiving the http response, and using that one value the whole time it is running. This will cause us to scrape the same page every time. If we want to have a different page to scrape for each loop iteration, we need to perform another http request each time. Something like this:
def abcStart(timeLink):
    """Re-fetch *timeLink* on each pass and compare its first <b> text.

    ``timeLink`` is a URL string. ``otherTimestamp`` is read from the
    enclosing scope and is not defined in this snippet.
    """
    while True:
        res = requests.get(timeLink)  # send request here
        res.raise_for_status()
        timestamp = BeautifulSoup(res.content, 'html.parser').find_all('b')
        if timestamp[0].text == otherTimestamp[0].text:
            # work on something
            break
        if timestamp[0].text > otherTimestamp[0].text:
            continue
        else:
            print('not yet')
            time.sleep(30)
            break
timeLink = 'http://example.com/somelink'  # declare url
# Use the same spelling as the variable above, and pass a one-element tuple:
# `(timeLink)` without the trailing comma is just the string itself.
threadobj = threading.Thread(target=abcStart, args=(timeLink,))
threadobj.start()
threadobj.join()
I guess you should move timeLink request inside your function:
def abcStart(timeLink='http://example.com/somelink'):
    """Re-fetch *timeLink* on each pass and compare its first <b> text.

    The URL defaults to the page being monitored, so the function can be
    started with no arguments. ``otherTimestamp`` is read from the enclosing
    scope and is not defined in this snippet.
    """
    while True:
        res = requests.get(timeLink)
        res.raise_for_status()
        timestamp = BeautifulSoup(res.content, 'html.parser').find_all('b')
        if timestamp[0].text == otherTimestamp[0].text:
            # work on something
            break
        if timestamp[0].text > otherTimestamp[0].text:
            continue
        else:
            print('not yet')
            time.sleep(30)
            break
# abcStart takes one parameter, so it must receive an argument; an empty
# `args` tuple would call it with none.
threadobj = threading.Thread(target=abcStart, args=('http://example.com/somelink',))
threadobj.start()
threadobj.join()
I am trying to make an on/off switch for my program:
(see after the ### for what I'm talking about)
while 1:
str = raw_input("insert your word: ")
n = input("insert your scalar: ")
def string_times(str, n):
return n * str
print string_times(str, n)
###
def switch(on,off):
raw_input("On or off? ")
if switch == "on":
continue
if switch == "off":
break
switch(on,off)
I get a continue not in loop error. Basically, I want to create an on or off switch after the program runs once. What do I fix?
You cannot use break and continue in a nested function. Use the return value of the function instead:
def switch():
    """Ask "On or off?" and return True only when the reply is "on"."""
    resp = raw_input("On or off? ")
    # No trailing colon: `return` is a simple statement, not a block header.
    return resp == "on"
while True:
    # other code
    # Leave the loop as soon as the user answers anything but "on".
    if not switch():
        break
Note that there is little point in defining your functions in the loop. Define them before the loop, as creating the function object takes some performance (albeit a small amount).
The switch() function needs no arguments (you didn't use them at all), and the continue is also not needed. If you didn't break out of the loop, it'll just continue from the top when you reach the end.
You only need continue if you want the loop to start at the top again skipping the rest of the code in the loop:
count = 0
while True:
count += 1
print count
if loop % 2 == 0:
continue
print 'We did not continue and came here instead.'
if count >= 3:
break
print 'We did not break out of the loop.'