Why won't listfunction iterate through urls? - python

I am trying to write a program that opens a url, finds a name in a certain line, and saves it. Then it should find the url in the same line as the name, open it, and find the name + url in the same line # as the previous page. It should do this 4 times.
I can't get it to iterate through the new url parameter. It keeps returning the same name and url. What is going wrong here? Thanks.
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import ssl
linklist = list()
namelist = list()
linelist = list()
count = 0
listposition = int(input("Please enter list position: "))
goodnamelist = list(["Fikret"])
nexturl = "http://py4e-data.dr-chuck.net/known_by_Fikret.html"
def listfunction(url):
ctx = ssl.create_default_context()
#Allows reading of HTTPS pages
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")
linelist = soup('a')
for line in linelist:
#Creates list of lines in webpage:
linklist.append(re.findall("(http://.+)\"", str(line)))
#Creates list of names in line:
namelist.append(re.findall(">(.+)</a>", str(line)))
#Creates list of names in the designated user-input position:
goodnamelist.append(namelist[listposition][0])
nexturl = linklist[listposition][0]
return nexturl
while (count < 4):
nexturl = listfunction(nexturl)
print(listfunction(nexturl))
count += 1
print(nexturl)
continue
print(linelist)
print(linklist)
print(namelist)
print(nexturl)
print(goodnamelist)
print(listfunction(nexturl))

You do not actually set nexturl in listfunction(). Therefore the method just returns the same initial global variable every time.

Related

Saving a "for loop" iteration

When I run the code below, the for loop saves the first text correctly into a separate file, but the second iteration saves the first AND the second into another separate file, and the third iteration saves the first, second and third into a separate file and so on.... I'd like to save each iteration into a separate file but not adding the previous iterations. I don't have a clue to what I'm missing here. Can anyone help, please?
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'http://www.chakoteya.net/StarTrek/'
end_url = ['1.htm', '6.htm', '8.htm', '2.htm', '7.htm',
'5.htm', '4.htm', '10.htm', '12.htm', '11.htm', '3.htm', '16.htm']
episodes = []
count = 0
for end_url in end_url:
url = base_url + end_url
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
episodes.append(soup.text)
file_text = open(f"./{count}.txt", "w")
file_text.writelines()
file_text.close()
count = count + 1
print(f"saved file for url:{url}")
Please consider the following points!
there's no reason at all to use bs4! since response.text is actually holding the same.
You've to use Same Session explained on my previous answer
You can use iteration with fstring/format which will let your code more cleaner and easier to read.
with context manager is less headache as you don't need to remember to close your file after!
import requests
block = [9, 13, 14, 15]
def main(url):
with requests.Session() as req:
for page in range(1, 17):
if page not in block:
print(f'Extracing Page# {page}')
r = req.get(url.format(page))
with open(f'{page}.htm', 'w') as f:
f.write(r.text)
main('http://www.chakoteya.net/StarTrek/{}.htm')
You needed to empty your episodes for each iteration. Try the following:
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'http://www.chakoteya.net/StarTrek/'
end_url = ['1.htm', '6.htm', '8.htm', '2.htm', '7.htm',
'5.htm', '4.htm', '10.htm', '12.htm', '11.htm', '3.htm', '16.htm']
count = 0
for end_url in end_url:
episodes = []
url = base_url + end_url
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
episodes.append(soup.text)
file_text = open(f"./{count}.txt", "w")
file_text.writelines(episodes)
file_text.close()
count = count + 1
print(f"saved file for url:{url}")
It doesn't appear that your code would save anything to the files at all as you are calling writelines with no arguments
if __name__ == '__main__':
import requests
from bs4 import BeautifulSoup
base_url = 'http://www.chakoteya.net/StarTrek/'
paths = ['1.htm', '6.htm', '8.htm', '2.htm', '7.htm',
'5.htm', '4.htm', '10.htm', '12.htm', '11.htm', '3.htm', '16.htm']
for path in paths:
url = f'{base_url}{path}'
filename = path.split('.')[0]
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
with open(f"./{filename}.txt", "w") as f:
f.write(soup.text)
print(f"saved file for url:{url}")
This is reworked a little. It wasn't clear why the data was appending to episodes so that was left off.
Maybe you were writing the list to the file which would account for dupes. You were adding the content to each file to a list and writing that growing list each iteration.

scraper filters out word's instead of line

So I was trying to make a filter that filter's out the crap from this scrape, but I have an issue where it filters out the words. I would like to filter out the whole line instead of the words.
from bs4 import BeautifulSoup
import requests
import os
def Scrape():
page = input("Page: ")
url = "https://openuserjs.org/?p=" + page
source = requests.get(url)
soup = BeautifulSoup(source.text,'lxml')
os.system('cls')
Filter(soup)
def Filter(soup):
crap = ""
f = open("Data/Crap.txt", "r")
for craptext in f:
crap = craptext
for Titles in soup.select("a.tr-link-a>b"):
print(Titles.text.replace(crap, "").strip())
while True:
Scrape()
Instead of:
print(Titles.text.replace(crap, "").strip())
Try using:
if crap not in Titles.text:
print(Titles.text.strip())

List index out of range with web scraping

my code has the list index out of range error.
import requests
from bs4 import BeautifulSoup
import re
import pyperclip
# import pandas as pd
import csv
# Get a name of the agency
def getAgency(pageURL):
res = requests.get(pageURL)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')
elems = soup.select('h3.company-name > a')
names = []
for i in range(len(elems)):
names.append(str(elems[i].text.strip()))
return names
def getWebsite(pageURL):
res = requests.get(pageURL)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')
elems = soup.select('li.website-link.website-link-a > a')
sites = []
for elem in elems:
if elem.find('/your-project') != -1:
elems.remove(elem)
else:
pass
for i in range(len(elems)):
sites.append(str(elems[i]["href"]))
return sites
allNames = []
for pagenumber in range(0,1):
names = getAgency('https://clutch.co/agencies/digital?page=' + str(pagenumber))
allNames += names
allSites = []
for pagenumber in range(0,1):
sites = getWebsite('https://clutch.co/agencies/digital?page=' + str(pagenumber))
allSites += sites
final = []
with open('text.csv', 'w', newline='') as f:
a = csv.writer(f, delimiter=',')
for index in range(len(allNames)):
final.append(",".join([allNames[index].replace(",", " "), allSites[index]]))
a.writerows(final)
finalresult = "\n".join(final)
pyperclip.copy(finalresult)
I know what causes this error, this if statement.
for elem in elems:
if elem.find('/your-project') != -1:
elems.remove(elem)
else:
pass
When I delete an item from the list, the index number decreases by 1. And so, in this for loop, the index will be the length of the allNames, and I used the same index with the allSites. So, when the index will be the last item of allNames, the allSites will throw an error because it will be out of range. What can I do to solve this problem?
for index in range(len(allNames)):
final.append(",".join([allNames[index].replace(",", " "), allSites[index]]))
a.writerows(final)
You need to parse the code in one iteration and spit out pairs of names and sites, and then filter to throw out the pairs.
As your code is right now the lists aren't even the same length before you try to filter them by '/your-project' because there are advertising links that match 'li.website-link.wibise-link-a > a'

My python code looks up only last value from the loop

I have a file called IP2.txt and this file contains 2 rows as shown below.
103.201.150.209
113.170.129.113
My code goes like this it reads the file IP2 and looks up the website search
import requests
from bs4 import BeautifulSoup
import re
fh = open('IP2.txt')
for line in fh:
ip = line.rstrip()
print(ip)
loads = {'q':line.rstrip(),'engine':1}
r = requests.get('https://fortiguard.com/search',params=loads)
# print(r.url)
# print(r.text)
Link_text = r.text
soup = BeautifulSoup(Link_text, 'lxml')
for product in soup.find_all('section', class_='iprep'):
product_title = product.find("a").text
print(ip+':'+ product_title)
fh.close()
The output of the above code is like this.
103.201.150.209
113.170.129.113
113.170.129.113:Malicious Websites
As you can see it's reading the last line and skipping the first value: 103.201.150.209
It seems like your indentation is not correct, causing lines that should be part of your loops to be executed only once after those loops are over. You are probably looking for this:
with open('IP2.txt') as fh:
for line in fh:
ip = line.rstrip()
print(ip)
loads = {'q':line.rstrip(), 'engine':1}
r = requests.get('https://fortiguard.com/search', params=loads)
# do the following for ALL ips
soup = BeautifulSoup(r.text, 'lxml')
for product in soup.find_all('section', class_='iprep'):
product_title = product.find("a").text
# print ALL products
print(ip + ':' + product_title)
Also note the use of with which will auto-close your file even if something goes wrong in between.
You are overriding r value every time in your for loop. You can create a list outside of your loop and append to it every time in loop. Other way would be to do all your BeautifulSoup operations and printing inside your for loop, then you will be getting your printout for every r.
I think you need to loop over the return values from requests:
import requests
from bs4 import BeautifulSoup
import re
with open('IP2.txt') as fh:
texts = []
for line in fh:
ip = line.rstrip()
print(ip)
loads = {'q':line.rstrip(),'engine':1}
r = requests.get('https://fortiguard.com/search',params=loads)
texts.append(r.text)
for text in texts:
soup = BeautifulSoup(text, 'lxml')
for product in soup.find_all('section', class_='iprep'):
product_title = product.find("a").text
print(ip+':'+ product_title)
I think what you need is :
import requests
from bs4 import BeautifulSoup
import re
fh = open('IP2.txt')
for line in fh:
ip = line.rstrip()
print(ip)
loads = {'q':line.rstrip(),'engine':1}
r = requests.get('https://fortiguard.com/search',params=loads)
Link_text = r.text
soup = BeautifulSoup(Link_text, 'lxml')
for product in soup.find_all('section', class_='iprep'):
product_title = product.find("a").text
print(ip+':'+ product_title)
fh.close()
Seems more like an indentation problem.

How to get all application link from the log text file?

I have a log file which contains:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
http://www.downloadray.com/windows/Photos_and_Images/Graphic_Capture/
http://www.downloadray.com/windows/Photos_and_Images/Digital_Photo_Tools/
I have this code:
from bs4 import BeautifulSoup
import urllib
import urlparse
f = open("downloadray2.txt")
g = open("downloadray3.txt", "w")
for line in f.readlines():
i = 1
while 1:
url = line+"?page=%d" % i
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
has_more = 1
for a in soup.select("div.n_head2 a[href]"):
try:
print (a["href"])
g.write(a["href"]+"\n")
except:
print "no link"
if has_more:
i += 1
else:
break
This code do not give error but it do not working.
I tried modified it but can't solved it.
But when I try this code,it works well:
from bs4 import BeautifulSoup
import urllib
import urlparse
g = open("downloadray3.txt", "w")
url = "http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/"
pageUrl = urllib.urlopen(url)
soup = BeautifulSoup(pageUrl)
i = 1
while 1:
url1 = url+"?page=%d" % i
pageHtml = urllib.urlopen(url1)
soup = BeautifulSoup(pageHtml)
has_more = 2
for a in soup.select("div.n_head2 a[href]"):
try:
print (a["href"])
g.write(a["href"]+"\n")
except:
print "no link"
if has_more:
i += 1
else:
break
So how can I make it can read from the log text file. It is hard to take link one by one to be read.
Have you stripped the newline from the end of the line?
for line in f.readlines():
line = line.strip()
readlines() will produce a list of lines taken from the file including the newline \n character.
Proof Evidence by printing url variable (after the line url = line+"?page=%d" % i):
Your original code:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=1
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=2
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=3
With my suggested fix:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=1
http://www.downloadray.com/TIFF-to-JPG_download/
http://www.downloadray.com/Moo0-Image-Thumbnailer_download/
http://www.downloadray.com/Moo0-Image-Sizer_download/
http://www.downloadray.com/Advanced-Image-Viewer-and-Converter_download/
http://www.downloadray.com/GandMIC_download/
http://www.downloadray.com/SendTo-Convert_download/
http://www.downloadray.com/PNG-To-JPG-Converter-Software_download/
http://www.downloadray.com/Graphics-Converter-Pro_download/
http://www.downloadray.com/PICtoC_download/
http://www.downloadray.com/Free-Images-Converter_download/
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=2
http://www.downloadray.com/VarieDrop_download/
http://www.downloadray.com/Tinuous_download/
http://www.downloadray.com/Acme-CAD-Converter_download/
http://www.downloadray.com/AAOImageConverterandFTP_download/
http://www.downloadray.com/ImageCool-Converter_download/
http://www.downloadray.com/GeoJpeg_download/
http://www.downloadray.com/Android-Resizer-Tool_download/
http://www.downloadray.com/Scarab-Darkroom_download/
http://www.downloadray.com/Jpeg-Resizer_download/
http://www.downloadray.com/TIFF2PDF_download/
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=3
http://www.downloadray.com/JGraphite_download/
http://www.downloadray.com/Easy-PNG-to-Icon-Converter_download/
http://www.downloadray.com/JBatch-It!_download/
http://www.downloadray.com/Batch-It!-Pro_download/
http://www.downloadray.com/Batch-It!-Ultra_download/
http://www.downloadray.com/Image-to-Ico-Converter_download/
http://www.downloadray.com/PSD-To-PNG-Converter-Software_download/
http://www.downloadray.com/VectorNow_download/
http://www.downloadray.com/KeitiklImages_download/
http://www.downloadray.com/STOIK-Smart-Resizer_download/
Update:
Then again, this code won't run as expected, because the while loop will never continue since the has_more variable is never changed.
You know that you don't have more links when the list returned by `soup.select(...)` is empty. You can check for emptiness using `len(...)`. So that part might go like this:
list_of_links = soup.select("div.n_head2 a[href]")
if len(list_of_links)==0:
break
else:
for a in soup.select("div.n_head2 a[href]"):
print (a["href"])
g.write(a["href"]+"\n")
i += 1
Apparently the page still display the latest page available if it's queried beyond the maximum page. So if the maximum page number available is 82 and you query page 83, it will give page 82. To detect this case, you can save the list of previous page urls, and compare it with current list of urls.
Here is the full code (tested):
from bs4 import BeautifulSoup
import urllib
import urlparse
f = open("downloadray2.txt")
g = open("downloadray3.txt", "w")
for line in f.readlines():
line = line.strip()
i = 1
prev_urls = []
while 1:
url = line+"?page=%d" % i
print 'Examining %s' % url
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
list_of_urls = soup.select("div.n_head2 a[href]")
if set(prev_urls)==set(list_of_urls):
break
else:
for a in soup.select("div.n_head2 a[href]"):
print (a["href"])
g.write(a["href"]+"\n")
i += 1
prev_urls = list_of_urls

Categories

Resources