Basically, I'm trying to take a starting variable and run it through my code.
Then I add 1 to the variable, rerun the code, and repeat until I stop it. I have written this up, but it's pretty far off from what I'm trying to accomplish. Am I on the right path or completely off?
CODE -
import requests
from bs4 import BeautifulSoup as bs
import re

start = 1
print("starting number: ", start)
i = 0
number = 500

while i < number:
    url = "https://randomsite.com/{0}".format(start + i)
    try:
        print(int(start) + i)
        response1 = requests.get(url)
        name = re.findall(r'analyticsKey":"([^"]+)', response1.text)
        ids = re.findall(r'id":"([^"]+)', response1.text)
        print(name)
        print("")
        print(ids)
        print("")
    except:
        pass
    i += 1
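For comparison, the increment-and-rerun idea described above can also be written as a single for loop over a range. This is only a sketch that reuses the placeholder URL and the two regexes from the code above, with 500 kept as the stop condition:

import re
import requests

START = 1
PAGES = 500  # how many consecutive numbers to try before stopping

for n in range(START, START + PAGES):
    url = "https://randomsite.com/{0}".format(n)
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(n, "request failed:", exc)  # report failures instead of silently passing
        continue
    names = re.findall(r'analyticsKey":"([^"]+)', response.text)
    ids = re.findall(r'id":"([^"]+)', response.text)
    print(n, names, ids)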
First of all, I should mention that I have very little programming experience, and I have some trouble with the logic and flow of a general web scraper implemented in Python. I assume that I should use callbacks or similar methods to properly control the process of saving pages from a JavaScript e-book reader. My script does work, but not consistently. If someone could advise me on improvements to this script, that would be great. Thank you.
from seleniumwire import webdriver  # selenium-wire provides driver.requests (this import was missing)
from seleniumwire.utils import decode as sdecode
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options # [!]
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import os.path

opts = Options() # [!]
opts.add_experimental_option('w3c', True) # [!]
capabilities = DesiredCapabilities.CHROME.copy()
driver = webdriver.Chrome(chrome_options=opts, desired_capabilities=capabilities)

url = ' here comes url'
driver.get(url)

endmark = '&scale=2&rotate=0'  # length must be 17

def get_requests():
    l = []
    for rx in driver.requests:
        if rx.url[-17:] == endmark:
            l.append(rx.url)
    return list(set(l))
def savepages(diff):
    newpages = 0
    for urlitem in diff:
        for request in driver.requests:
            if request.url == urlitem:
                #print(request.url)
                ind = urlitem.find('.jp2&id') # ex. 0012.jp2&id
                file_path = directory_path + '\\' + file_name + urlitem[ind-4:ind] + '.jpg'
                data = None  # initialise first so the 'if data:' check below cannot raise UnboundLocalError
                tik = 0
                while tik < 10: # waiting for the response body data
                    try:
                        tik += 1
                        data = sdecode(request.response.body,
                                       request.response.headers.get('Content-Encoding', 'identity'))
                        break  # body decoded successfully, stop retrying
                    except AttributeError: # response body not available yet
                        time.sleep(2) # wait 2 sec for the data
                        continue
                #data = data.decode("utf-8",'ignore')
                if data:
                    with open(file_path, 'wb') as outfile:
                        outfile.write(data)
                else:
                    print('no data')
                # was the file saved or not
                if os.path.exists(file_path):
                    newpages += 1 # something is wrong with the counting logic, since pages+newpages should equal the length of li=get_requests(), but I get more
                else:
                    time.sleep(.5)
    return newpages
count = 0 # a counter that should terminate the main delay loop
pages = 0 # counts all saved pages; book pages and images are equivalent, one turn should open 2 new pages/images/requests
oldli = [] # compared to the new list after each delay cycle
turns = 0 # how many turns have been made, i.e. how many times we clicked the Next Page button

li = get_requests() # all unique image/page requests so far; some requests might still be loading, but we manually opened the first page and visually confirmed there are at least 1 or 3 images/requests

if li: # the program STARTS HERE; first try, there are some requests because we manually opened the first page
    # THE MAIN CYCLE: should stop when the delay is too long and we have turned all the pages of the book
    while 2*turns + 1 < len(li) or count < 15: # should terminate the whole program when no more images are coming
        count = 0 # reset the counter
        success = False # reset success, i.e. whether new pages downloaded successfully
        # the main delay counter
        # what happens if diff is [] and there is no success?
        while True:
            count += 1
            if count > 14:
                print('Time out after more than 10 seconds.')
                break
            li = get_requests() # in addition, I assume that all requests counting from page 1 are kept
            # it is possible that li will not have some of the old requests and oldli will be longer
            # I probably need to keep all old requests in a separate list and append to it
            diff = list(set(li) - set(oldli)) # find new requests after the delay
            if diff: # there are some new ones
                npages = savepages(diff) # saves the new images and returns the number of them
                print('newpages ', npages, ' len diff ', len(diff)) # should be equal
                if npages >= len(diff) - 1: # we allow one request without a body with data ??
                    pages += npages # something is not OK here, the number of pages sometimes exceeds the length of li
                    success = True # we call it a success
                else:
                    print('Could not save pages. Newpages ', npages, ' len diff ', len(diff))
                    for pg in diff:
                        print(pg) # for debugging purposes
                    break # in this case break out of the delay cycle
            else:
                time.sleep(2) # if there are no new requests, add 2 sec to the waiting time
            if success: # we turn pages after a successful download; this is bad if we need to catch up
                while 2*turns + 1 < len(li): # if some old requests were dropped, the program will stop earlier
                    # it won't wait for the bodies of the requests, which is a problem
                    driver.find_elements(By.CLASS_NAME, "BRicon.book_right.book_flip_next")[0].click()
                    turns += 1
                    time.sleep(3) # I have the impression this doesn't happen
                oldli = li
                print('pages ', pages, ' length of list ', len(li))
                break # break out of the delay cycle since we succeeded
            time.sleep(2) # the main delay timer; plus the no-diff timer = total time
else:
    print('no requests in the list to process')
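One simplification that might make the flow easier to control is pulling the wait-for-response-body retries into a small helper. This is only a sketch, built from the same selenium-wire objects the script already uses (request.response, sdecode); the retry count and delay are arbitrary:

def get_body(request, attempts=10, delay=2):
    """Return the decoded response body of a captured request, or None if it never arrives."""
    for _ in range(attempts):
        response = request.response
        if response is not None and response.body:
            return sdecode(response.body,
                           response.headers.get('Content-Encoding', 'identity'))
        time.sleep(delay)  # body not captured yet; wait and check again
    return None

savepages() could then call data = get_body(request) and simply skip the item when it gets None, which removes the tik counter and the risk of referencing data before it is assigned.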
I am a student working on a scraping project, and I am having trouble completing my script because it fills my computer's memory with all of the data it stores.
It currently stores all of the data until the very end. My plan is to break the scrape into smaller chunks and write the data out periodically, so it does not keep building one big list and only write it out at the end.
In order to do this, I would need to stop my scroll method, scrape the loaded profiles, write out the data that I have collected, and then repeat this process without duplicating my data. It would be appreciated if someone could show me how to do this. Thank you for your help :)
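Schematically, the write-out-in-chunks plan described above might look like the following sketch (chunks, scrape_batch and the batch size are hypothetical placeholders, not part of the actual script below):

import csv

def chunks(items, size):
    # split the collected profile links into batches of `size`
    for i in range(0, len(items), size):
        yield items[i:i + size]

def scrape_batch(profile_links):
    # hypothetical placeholder: visit each profile and return one row of fields per profile
    return [[link] for link in profile_links]

all_links = []  # filled by the scroll loop in the script below

with open("Spredsheet.txt", "a", newline="") as output:
    writer = csv.writer(output)
    for batch in chunks(all_links, 50):
        writer.writerows(scrape_batch(batch))  # flush each chunk to disk immediately
        # nothing from this batch is kept in memory afterwards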
Here's my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException

Data = []

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))

body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

while len(profile_count) < count: # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count: # Calling up links
    temp = link.get_attribute('href') # temp for
    driver.execute_script("window.open('');") # open new tab
    driver.switch_to.window(driver.window_handles[1]) # focus new tab
    driver.get(temp)
    # scrape code
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
    IssuedBy = "Board of Certified Safety Professionals"
    CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
    CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
    try:
        AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
    except NoSuchElementException:
        AccreditedBy = "N/A"
    try:
        Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
    except NoSuchElementException:
        Expires = "N/A"
    info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
    Data.extend(info)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

with open("Spredsheet.txt", "w") as output:
    output.write(','.join(Data))

driver.close()
Try the approach below using requests and BeautifulSoup. The script uses the API URL fetched from the website itself.
On the first iteration it creates the base URL and writes the headers and the data into a .csv file.
On each following iteration it creates the URL again with two extra parameters, start_on_page and show_per_page, where start_on_page is incremented by 20 on each iteration and show_per_page defaults to 100 so that 100 records are extracted per iteration, and so on until all the data has been dumped into the .csv file.
The script dumps four things: number, name, location and profile URL.
On each iteration the data is appended to the .csv file, so this approach resolves your memory issue.
Before running the script, do not forget to set the file_path variable to the system path where you want the .csv file to be created.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
import csv

def scrap_directory_data():
    list_of_credentials = []
    file_path = ''
    file_name = 'credential_list.csv'
    count = 0
    page_number = 0
    page_size = 100
    create_url = ''
    main_url = 'https://directory.bcsp.org/search_results.php?'
    first_iteration_url = 'first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries='
    number_of_records = 0
    csv_headers = ['#', 'Name', 'Location', 'Profile URL']

    while True:
        if count == 0:
            create_url = main_url + first_iteration_url
            print('-' * 100)
            print('1 iteration URL created: ' + create_url)
            print('-' * 100)
        else:
            create_url = main_url + 'start_on_page=' + str(page_number) + '&show_per_page=' + str(page_size) + '&' + first_iteration_url
            print('-' * 100)
            print('Other than first iteration URL created: ' + create_url)
            print('-' * 100)

        page = requests.get(create_url, verify=False)
        extracted_text = bs(page.text, 'lxml')
        result = extracted_text.find_all('tr')
        if len(result) > 0:
            for idx, data in enumerate(result):
                if idx > 0:
                    number_of_records += 1
                    name = data.contents[1].text
                    location = data.contents[3].text
                    profile_url = data.contents[5].contents[0].attrs['href']
                    list_of_credentials.append({
                        '#': number_of_records,
                        'Name': name,
                        'Location': location,
                        'Profile URL': profile_url
                    })
                print(data)
                with open(file_path + file_name, 'a+') as cred_CSV:
                    csvwriter = csv.DictWriter(cred_CSV, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                    if idx == 0 and count == 0:
                        print('Writing CSV header now...')
                        csvwriter.writeheader()
                    else:
                        for item in list_of_credentials:
                            print('Writing data rows now..')
                            print(item)
                            csvwriter.writerow(item)
                        list_of_credentials = []
        else:
            break  # no table rows returned: all pages have been fetched, stop the loop
        count += 1
        page_number += 20

scrap_directory_data()
I am trying to extract certain information from a long block of text to display it nicely, but I cannot seem to figure out how exactly to tackle this problem.
My text is as follows:
"(Craw...Crawley\n\n\n\n\n\n\n08:00\n\n\n\n\n\n\n**Hotstage**\n **248236**\n\n\n\n\n\n\n\n\n\n\n\n\n\nCosta Collect...Costa Coffee (Bedf...Bedford\n\n\n\n\n\n\n08:00\n\n\n\n \n\n\n**Hotstage**\n **247962**\n\n\n\n\n\n\n\n\n\n\n\n\n\nKFC - Acrelec Deployment...KFC - Sheffield Qu...Sheffield\n\n\n\n\n\n\n08:00\n\n\n\n\n\n\nHotstage\n 247971\n\n\n\n\n\n\n\n\n\n\n\n\n\nKFC - Acrelec Deployment...KFC - Brentford...BRENTFORD\n\n\n\n\n\n\n08:00\n\n\n\n\n\n\nHotstage\n 248382\n\n\n\n\n\n\n\n\n\n\n\n\n\nKFC - Acrelec Deployment...KFC - Newport"
I would like to extract the highlighted parts (the text wrapped in ** above).
I'm thinking the solution is simple, and maybe I am just not storing or extracting the information properly.
This is my code:
from bs4 import BeautifulSoup
import requests
import re
import time

def main():
    url = "http://antares.platinum-computers.com/schedule.htm"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    response.close()
    # Get
    tech_count = 0
    technicians = []  # List to hold technicians' names
    xcount = 0
    test = 0
    name_links = soup.find_all('td', {"class": "resouce_on"})  # Get all table data with class name "resouce_on"
    # iterate through the html data and add the names to "technicians"
    for i in name_links:
        technicians.append(str(i.text.strip()))  # append value to the list
        tech_count += 1
    print("Found: " + str(tech_count) + " technicians + 1 default unallocated.")
    for t in technicians:
        print(xcount, t)
        xcount += 1
    test = int(input("choose technician: "))
    for link in name_links:
        if link.find(text=re.compile(technicians[test])):
            jobs = []
            numbers = []
            unique_cr = []
            jobs.append(link.parent.text.strip())
            for item in jobs:
                for subitem in item.split():
                    if subitem.isdigit():
                        numbers.append(subitem)
            for number in numbers:
                if number not in unique_cr:
                    unique_cr.append(number)
            print("tasks for technician " + str(technicians[test]) + " are as follows")
            for cr in unique_cr:
                print(cr)  # print each unique reference number

if __name__ == '__main__':
    main()
It's fairly simple:
myStr = "your complicated text"
words = myStr.split("\n")
niceWords = []
for word in words:
    if "**" in word:
        niceWords.append(word.replace("**", ""))
print(niceWords)
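Applied to the sample text in the question, niceWords would end up holding the bold entries with the markers removed, for example 'Hotstage' and ' 248236' (leading whitespace included; add .strip() if that matters). An alternative that skips the newline splitting is to pull out everything wrapped in double asterisks with a regular expression; a minimal sketch:

import re

# capture any text wrapped in ** ... ** in the scraped string
highlighted = [m.strip() for m in re.findall(r"\*\*([^*]+)\*\*", myStr)]
print(highlighted)  # e.g. ['Hotstage', '248236', ...]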
I'm trying to build a web crawler using BeautifulSoup and urllib. The crawler is working, but it does not open all the pages in a site: it opens the first link and goes to that page, then opens the first link of that page, and so on.
Here's my code:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin
import json, sys

sys.setrecursionlimit(10000)

url = input('enter url ')
d = {}
d_2 = {}
l = []
url_base = url
count = 0

def f(url):
    global count
    global url_base
    if count <= 100:
        print("count: " + str(count))
        print('now looking into: ' + url + '\n')
        count += 1
        l.append(url)
        html = urlopen(url).read()
        soup = BeautifulSoup(html, "html.parser")
        d[count] = soup
        tags = soup('a')
        for tag in tags:
            meow = tag.get('href', None)
            if urljoin(url, meow) in l:
                print("Skipping this one: " + urljoin(url, meow))
            elif "mailto" in urljoin(url, meow):
                print("Skipping this one with a mailer")
            elif meow == None:
                print("skipping 'None'")
            elif meow.startswith('http') == False:
                f(urljoin(url, meow))
            else:
                f(meow)
    else:
        return

f(url)
print('\n\n\n\n\n')
print('Scraping Completed')
print('\n\n\n\n\n')
The reason you're seeing this behavior is that the code calls your function recursively: as soon as it finds a valid link, f gets called again, which prevents the rest of the for loop from running until that call returns.
What you're doing is a depth-first search, but the internet is very deep. You want to do a breadth-first search instead.
Probably the easiest way to modify your code to do that is to have a global list of links to follow. Have the for loop append all the scraped links to the end of this list, and then, outside of the for loop, remove the first element of the list and follow that link, as in the sketch after this answer.
You may have to change your logic slightly for your max count.
If count reaches 100, no further links will be opened. Therefore I think you should decrease count by one after leaving the for loop. If you do this, count would be something like the current link depth (and 100 would be the maximum link depth).
If the variable count should refer to the number of opened links, then you might want to control the link depth in another way.
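A minimal sketch of that breadth-first version, keeping the meow name from the question for the href value and using a deque as the global list of links to follow (error handling is left out):

from collections import deque
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

visited = set()
queue = deque()  # global list of links to follow, oldest first

def crawl(start_url, max_links=100):
    count = 0
    queue.append(start_url)
    while queue and count < max_links:
        url = queue.popleft()          # take the first element of the list
        if url in visited:
            continue
        visited.add(url)
        count += 1
        print('now looking into:', url)
        soup = BeautifulSoup(urlopen(url).read(), "html.parser")
        for tag in soup('a'):
            meow = tag.get('href', None)
            if meow is None or "mailto" in meow:
                continue
            link = urljoin(url, meow)
            if link not in visited:
                queue.append(link)     # scraped links go on the end of the list

Calling crawl(url) then visits pages level by level instead of following the first chain of links all the way down.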
Hello, I'm learning how to parse HTML with BeautifulSoup. I would like to know whether it is possible to use a user input in a for loop, as in:
for (user input) in A
Here A is a list of links, so the user can choose which link to follow by typing an input.
Then I use urllib to open that link and repeat the process.
You can use something like this:
from urllib.request import urlopen
from bs4 import BeautifulSoup

choice = ''
# urls: the list of links (A) from the question
for url in urls:
    print('Go to {}?'.format(url))
    decision = input('Y/n ')
    if decision == 'Y':
        choice = url
        break

if choice:
    r = urlopen(choice).read()
    soup = BeautifulSoup(r, 'lxml')
    # do something else
It wasn't exactly clear to me if you really wanted to "open" the link in a browser, so I included some code to do that. Is this maybe what you wanted from "digit a position"?
tl;dr
print("Which URL would you like to open?"
" (Please select an option between 1-{})".format(len(A)))
for index, link in enumerate(A):
print index+1, link
Full:
from bs4 import BeautifulSoup
import requests
import webbrowser

A = [
    'https://www.google.com',
    'https://www.stackoverflow.com',
    'https://www.xkcd.com',
]

print("Which URL would you like to open?"
      " (Please select an option between 1-{})".format(len(A)))
for index, link in enumerate(A):
    print(index + 1, link)

_input = input()
try:
    option_index = int(_input) - 1
except ValueError:
    print("{} is not a valid choice.".format(_input))
    raise

try:
    selection = A[option_index]
except IndexError:
    print("{} is not a valid choice.".format(_input))
    raise

webbrowser.open(selection)

response = requests.get(selection)
html_string = response.content
# Do parsing...
Thanks for your help. I reached a solution for this.
I created two variables: count = input() and position = input().
The count I used in a for loop, for _ in range(count), so I can make the process repeat as many times as the user wants (in this assignment it is 4).
The position (which for this assignment is predefined as 3) I use as a list index into a list of all the URLs. So to open the URL in position 3 I have:
url = links[position - 1] (the -1 is because the user inputs 3 but list indices start at 0).
Then I can use urllib.request.urlopen(url).read().
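Put together, that might look like the following sketch (links stands for the list of URLs collected earlier, and the variable names follow the description above):

from urllib.request import urlopen

links = []  # the list of all URLs gathered while parsing

count = int(input("How many repetitions? "))    # 4 for this assignment
position = int(input("Which link position? "))  # predefined as 3 here

for _ in range(count):
    url = links[position - 1]  # -1 because the user counts from 1 but list indices start at 0
    html = urlopen(url).read()
    # ... parse html with BeautifulSoup and rebuild links for the next pass ...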