How to pass the value of a loop into a function? - python

I want to do a search using keywords from a file in a loop, using Selenium and BeautifulSoup.
Read the 1st row, put its value (one keyword) into the search query, and search; when done, use the 2nd row from the file, and so on.
The read-file part does print all keywords, one per row, but I am not sure how to put them into the search query, one at a time.
def SearchFuncs():
    driver.get('https://www.website.com/search/?q=pet%20care') # put the value from one row on search/?q=
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.select('div.class_name a')
    for a in soup.select('div.class_name a'):
        #print(a['title'])
        return a

#SearchFuncs()
x = SearchFuncs()
print(x['title'])
# read file section:
with open("kw-to-search.txt", "r") as f:
    for line in f:
        print(line.strip())
Updated: I also added saving the result to a file, but I tested the code without the save-to-file section.
This is the code I tried using one of the solutions (broderick) provided, thank you broderick. I don't get any output, and no error either:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time

def SearchFuncs(addr):
    driver.get(addr)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.select('div.class_name a')
    for a in soup.select('div.class_name a'):
        #return a
        #print(a['title'])

with open("kw.txt", "r") as f:
    for line in f:
        addr_to_search = 'https://www.website.com/search/?q='
        # Build search query from lines
        pieces = line.split()
        query = ''
        for i in range(len(pieces) - 1):
            query += (pieces[i] + '%20')
        query += pieces[-1]
        # Debugging print
        print(query)
        addr_to_search += query
        SearchFuncs(addr_to_search)

textList = a['title']
outF = open("keyword_result.txt", 'a')
for line in textList:
    # write line to output file
    outF.write(line)
    #outF.write("\n")
outF.write(textList + '\n')
outF.close()
Updated with another code:
This is another variation Arthur Pereira provided, thank you, Arthur Pereira.
def SearchFuncs(url):
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.select('div.class_name a')
    for a in soup.select('div.class_name a'):
        return a
    #y = SearchFuncs(url)
    #print(y['title'])

    #print(a['title'])
    textList = a['title']
    outF = open("Keyword_results-2.txt", 'a')
    for line in textList:
        # write line to output file
        outF.write(line)
        #outF.write("\n")
    outF.write(textList + '\n')
    outF.close()

with open("kw.txt", "r") as f:
    for line in f:
        query = line.strip().replace(" ", "%20")
        url = "https://www.website.com/search/?q=" + query
        SearchFuncs(url)
Error:
Traceback (most recent call last):
File "c:/Users/mycomp/Desktop/Python/test/Test-Search-on-Pin-fromList-1.py", line 45, in <module>
SearchFuncs(url)
File "c:/Users/mycomp/Desktop/Python/test/Test-Search-on-Pin-fromList-1.py", line 31, in SearchFuncs
textList = a['title']
TypeError: list indices must be integers or slices, not str

Iterate over each line in your text file and prepare it for the search. Then pass this URL to your search function as a parameter.
Also, I think you misunderstand the concept of return. Here your code just returns the first a element, and nothing happens after that; it leaves the function:
for a in soup.select('div.Eqh.F6l.Jea.k1A.zI7.iyn.Hsu a'):
    return a
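If the intent is to work with every match rather than only the first, one option (a sketch reusing the question's driver and selector, not part of the original answer) is to collect all the titles and return them together:

def SearchFuncs(url):
    driver.get(url)  # assumes the Selenium driver is already set up
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # gather every title instead of leaving the function at the first match
    return [a['title'] for a in soup.select('div.class_name a')]

titles = SearchFuncs(url)  # a (possibly empty) list of title strings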
The error you are getting is because the select isn't finding anything, so a stays a list and you end up indexing a list with a string:
textList = a['title']
So, assuming you want to get the text inside each anchor element, you have to find the correct div and jump into the a element. Then you can get the title and write it to a file:
def SearchFuncs(url):
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    collection = soup.select('div.Collection-Item a')
    for item in collection:
        title = item['title'].strip()
        with open("Keyword_results-2.txt", 'a', encoding="utf-8") as outF:
            outF.write(title + '\n')  # write line to output file

with open("kw.txt", "r") as f:
    for line in f:
        query = line.strip().replace(" ", "%20")
        url = "https://www.pinterest.com/search/pins/?q=" + query
        SearchFuncs(url)

Try
def SearchFuncs(addr):
    driver.get(addr)
    ...
and
with open ("kw-to-search.txt", "r") as f:
for line in f:
addr_to_search = 'https://www.website.com/search/?q='
# Build search query from lines
pieces = line.split()
query = ''
for i in range(len(pieces) - 1):
query += (pieces[i] + '%20')
query += pieces[-1]
# Debugging print
print(query)
addr_to_search += query
SearchFuncs(addr_to_search)
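As a side note, the manual '%20' joining above can also be done with urllib.parse.quote from the standard library, which percent-encodes spaces and other unsafe characters. A minimal sketch, assuming the same kw-to-search.txt file and SearchFuncs as above:

from urllib.parse import quote

with open("kw-to-search.txt", "r") as f:
    for line in f:
        keyword = line.strip()
        if not keyword:
            continue  # skip blank lines
        # quote() turns spaces into %20 (and escapes other unsafe characters)
        SearchFuncs('https://www.website.com/search/?q=' + quote(keyword))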

Related

Python Properly replace some content from Variable

I'm trying to scrape using the Yahoo engine, with a keyword like "python".
I have written this little program:
query = "python"
url = {"https://fr.search.yahoo.com/search?p=&fr=yfp-search-sb",
"https://fr.search.yahoo.com/search?p=&fr=yfp-search-sb&b=11&pz=10&pstart=5"}
def checker():
for yahoo in url:
yahooo = yahoo.replace("&fr",query + "&fr")
r = requests.get(yahooo)
soup = bs(r.text, 'html.parser')
links = soup.find_all('a')
for link in soup.find_all('a'):
a = link.get('href')
unquote(a)
print("Urls : " + a)
with open("Yahoo.txt", mode="a",encoding="utf-8") as fullz:
fullz.write(a + "\n")
fullz.close()
lines_seen = set() # holds lines already seen
outfile = open("Yahoonodup.txt", "w", encoding="utf-8")
for line in open("Yahoo.txt", "r", encoding="utf-8"):
if line not in lines_seen: # not a duplicate
outfile.write(line)
lines_seen.add(line)
outfile.close()
checker()
My output file contains some URLs like this:
https://r.search.yahoo.com/cbclk2/dWU9MURCNjczQ0UwNThBNDk4MyZ1dD0xNjE2ODAzMTA5MDE4JnVvPTg0OTM3NTA2NTgyMzY5Jmx0PTImcz0xJmVzPVdHbFZxQzRHUFNfemNveGNLaUgxVkpoX3lXV2N2WFhiQkRfZklRLS0-/RV=2/RE=1616831909/RO=10/RU=https%3a%2f%2fwww.bing.com%2faclick%3fld%3de8BWTO-5A13W9y2D2Aw39AjjVUCUyb98EJf6bSa7R7dGxGXelKfNh7KW94OonXABpN7Bo9YkZqB22Evk3cfTIpJi3aGEXXKJMtDqnaNUDUVcsehzFOYyr09GoYqUE-iUywRWeOnV4aeACKf4_YX6dE2BVZAbqkvWj4HQMqeB_Fl1KlwT1v%26u%3daHR0cHMlM2ElMmYlMmZ2ZXJnbGVpY2guZm9jdXMuZGUlMmZ3YXNjaG1hc2NoaW5lJTJmJTNmY2hhbm5lbCUzZGJpbmclMjZkZXZpY2UlM2RjJTI2bmV0d29yayUzZG8lMjZjYW1wYWlnbiUzZDQwNzE4NzU1MCUyNmFkZ3JvdXAlM2QxMzU4OTk2OTA3NDAxNDE4JTI2dGFyZ2V0JTNka3dkLTg0OTM3NjAxMjIzNjUyJTNhbG9jLTcyJTI2YWQlM2Q4NDkzNzUwNjU4MjM2OSUyNmFkLWV4dGVuc2lvbiUzZA%26rlid%3d0fc40f09a4b6109e9c726f57d193ec0e/RK=2/RS=3w4U9AT_OQyaVSF.6KLwzWuo_LU-;_ylc=cnQDMQ--?IG=0ac9439bcf3f4ec087000000005bf464
And I want to change them into the real links:
https://vergleich.focus.de/waschmaschine/?channel=bing&device=c&network=o&campaign=407187550&adgroup=1358996907401418&target=kwd-84937601223652:loc-72&ad=84937506582369&ad-extension=
Is that possible?
As seen here, the response will give you the URL of the site that was ultimately responsible for returning the content. Meaning that for your example, you can do something like this:
url = 'https://r.search.yahoo.com/cbclk2/dWU9MURCNjczQ0UwNThBNDk4MyZ1dD0xNjE2ODAzMTA5MDE4JnVvPTg0OTM3NTA2NTgyMzY5Jmx0PTImcz0xJmVzPVdHbFZxQzRHUFNfemNveGNLaUgxVkpoX3lXV2N2WFhiQkRfZklRLS0-/RV=2/RE=1616831909/RO=10/RU=https%3a%2f%2fwww.bing.com%2faclick%3fld%3de8BWTO-5A13W9y2D2Aw39AjjVUCUyb98EJf6bSa7R7dGxGXelKfNh7KW94OonXABpN7Bo9YkZqB22Evk3cfTIpJi3aGEXXKJMtDqnaNUDUVcsehzFOYyr09GoYqUE-iUywRWeOnV4aeACKf4_YX6dE2BVZAbqkvWj4HQMqeB_Fl1KlwT1v%26u%3daHR0cHMlM2ElMmYlMmZ2ZXJnbGVpY2guZm9jdXMuZGUlMmZ3YXNjaG1hc2NoaW5lJTJmJTNmY2hhbm5lbCUzZGJpbmclMjZkZXZpY2UlM2RjJTI2bmV0d29yayUzZG8lMjZjYW1wYWlnbiUzZDQwNzE4NzU1MCUyNmFkZ3JvdXAlM2QxMzU4OTk2OTA3NDAxNDE4JTI2dGFyZ2V0JTNka3dkLTg0OTM3NjAxMjIzNjUyJTNhbG9jLTcyJTI2YWQlM2Q4NDkzNzUwNjU4MjM2OSUyNmFkLWV4dGVuc2lvbiUzZA%26rlid%3d0fc40f09a4b6109e9c726f57d193ec0e/RK=2/RS=3w4U9AT_OQyaVSF.6KLwzWuo_LU-;_ylc=cnQDMQ--?IG=0ac9439bcf3f4ec087000000005bf464'
response = requests.get(url)
print(response.url) ## this will give you 'https://vergleich.focus.de/waschmaschine/?channel=bing&device=c&network=o&campaign=407187550&adgroup=1358996907401418&target=kwd-84937601223652:loc-72&ad=84937506582369&ad-extension='
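Applied to the loop in the question, that could look roughly like this (a sketch under the same assumptions as the original checker function; note that resolving every link costs one extra request per URL):

for link in soup.find_all('a'):
    a = link.get('href')
    if not a or not a.startswith('http'):
        continue  # skip empty or relative hrefs
    resolved = requests.get(a).url  # final URL after any redirects
    print("Urls : " + resolved)
    with open("Yahoo.txt", mode="a", encoding="utf-8") as fullz:
        fullz.write(resolved + "\n")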

I would like to find out if the newly found links from Beautiful Soup are already in the queue.txt file and crawled.txt file

I have a Beautiful Soup program where I find all the links on a webpage and put them in a queue.txt file. The program then gets each link from the file and finds all the links on those pages. The crawled links then get put into a crawled.txt file.
I want to make sure I get no duplicates, so I want the program to go through queue.txt and crawled.txt, and if the links that have just been found are already in those files, they shouldn't be added.
I have tried putting the newly found links into a list, removing duplicates from there, and writing the list to a .txt file, but that can't tell what is already in the queue file; it only removes duplicates among the newly found links from the one page.
This is the code:
from bs4 import BeautifulSoup
import requests
import re
from urllib.parse import urlparse

def get_links(base_url, file_name):
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    single_slash = re.compile(r'^/\w')
    double_slash = re.compile(r'^//\w')
    parsed_uri = urlparse(base_url)
    domain_name = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
    with open(file_name, "a") as f:
        for tag in soup.find_all('a'):
            link = str(tag.get('href'))
            if str(link).startswith("http"):
                link = link
                print(link)
            if double_slash.match(link):
                link = 'https:' + link
                print(link)
            if single_slash.match(link):
                link = domain_name + link
                print(link)
            if str(link).startswith("#"):
                continue
            if str(link).startswith("j"):
                continue
            if str(link).startswith('q'):
                continue
            if str(link).startswith('u'):
                continue
            if str(link).startswith('N'):
                continue
            if str(link).startswith('m'):
                continue
            try:
                f.write(link + '\n')
            except:
                pass

get_links('https://stackabuse.com/reading-and-writing-lists-to-a-file-in-python/', "queue.txt")

with open('queue.txt') as f:
    lines = f.read().splitlines()
print(lines)

for link in lines:
    if lines[0] == "/":
        del lines[0]
    print(lines[0])
    with open('crawled.txt', 'a') as h:
        h.write('%s\n' % lines[0])
        h.close()
    del lines[0]
    if lines[0] == "/":
        del lines[0]
    with open('queue.txt', 'w') as filehandle:
        for listitem in lines:
            filehandle.write('%s\n' % listitem)
    page_url = lines[0]
    get_links(page_url, "queue.txt")
    print(lines)
    with open('queue.txt') as f:
        lines = f.read().splitlines()
In general for Python, when trying to remove duplicates, sets are usually a good bet. For example:
lines = open('queue.txt', 'r').readlines()
queue_set = set(lines)
result = open('queue.txt', 'w')
for line in queue_set:
    result.write(line)
Note: This will not preserve the order of the links, but I don't see a reason for that in this case.
Also, this was adapted from this answer.
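If the order of the queue does matter, a set can still be used just for the membership test while the lines are written back in their original order; a minimal sketch:

seen = set()
with open('queue.txt', 'r') as infile:
    lines = infile.readlines()
with open('queue.txt', 'w') as outfile:
    for line in lines:
        if line not in seen:  # keep only the first occurrence
            outfile.write(line)
            seen.add(line)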

Can't write in csv file

When I try to write the information to the csv file, an error is thrown:
Traceback (most recent call last):
File "sizeer.py", line 68, in <module>
writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup

proxies = {
    "http":"http://195.189.60.97:3128",
    "http":"http://103.78.75.165:8080",
    "http":"http://212.87.220.2:3128",
    "http":"http://88.99.134.61:8080",
    "http":"http://103.102.139.178:8080",
    "http":"http://218.60.8.83:3129",
    "http":"http://124.121.105.193:8888",
    "http":"http://198.237.114.54:8080",
    "http":"http://36.67.106.58:8080",
    "http":"http://35.214.241.28:3128"
}

base_url = ...

page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
    exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')

with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])

# Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
    anchor = cw.find("a", recursive=False)
    categories.append(anchor['href'])

# Iterate categories
for category in categories:
    cat_page = requests.get(base_url + category, proxies=proxies)
    cat_soup = BeautifulSoup(cat_page.content, 'lxml')
    products_wrapper = cat_soup.find(class_="b-productList")
    cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
    max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]
    # Iterate category with pagination and get products
    for i in range(1, max_page+1):
        cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
        cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
        product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")
        for link in product_links:
            # Get product data
            product_page = requests.get(base_url+link['href'], proxies=proxies)
            product_soup = BeautifulSoup(product_page.content, 'lxml')
            # Get product variations
            variations = product_soup.find_all(class_="m-productDescr_colorItem")
            # If there are variations
            if len(variations) > 0:
                for v in variations:
                    variation_page = requests.get(base_url+v['href'], proxies=proxies)
                    variation_soup = BeautifulSoup(variation_page.content, 'lxml')
                    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
                    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
                    color = v['title']
                    print(name)
                    print(color)
                    print(price)
                    print("-------------")
                    # Save in csv
                    writer.writerow([name,color,price])

print("SCRAPING DONE")
How do I keep the file open for the whole script execution? Or do I have to open it every time I add content? EDIT: In fact, the file is not even created.
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block - that is the block's purpose.
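A minimal reproduction of the traceback above: the writer object still exists after the block, but the file underneath it is already closed.

import csv

with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product", "Color", "Price"])  # fine, the file is open here

writer.writerow(["name", "red", "9.99"])  # ValueError: I/O operation on closed file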
You could put everything inside the block, but that only makes the existing problem worse: the code already reaches several levels of indentation, is long, and becomes difficult to understand. This is why you use functions to organize the code. For example, if you put the big for loop in a function:
def do_stuff_with(categories, writer):
    for category in categories:
        # lots of logic here
        # use `writer.writerow` when needed

# Get everything else set up that doesn't need the file, first
categories = ... # do the BeautifulSoup input stuff

# then we can open the file and use the function:
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])
    do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic for handling the variations of a single product. Or you can have a function that handles the creation of the categories data and returns it.
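For instance, the variation handling could become its own small function (a sketch with a hypothetical name, reusing the classes and proxies from the question):

def write_variations(product_soup, writer, base_url, proxies):
    # write one row per colour variation of a single product
    for v in product_soup.find_all(class_="m-productDescr_colorItem"):
        variation_page = requests.get(base_url + v['href'], proxies=proxies)
        variation_soup = BeautifulSoup(variation_page.content, 'lxml')
        price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
        name = variation_soup.find(class_="m-productDescr_headline").text.strip()
        writer.writerow([name, v['title'], price])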

Adding line breaks to text output file

The IDE output shows line breaks, but the txt output file does not. What am I missing?
from bs4 import BeautifulSoup
import requests

source = requests.get('https://dota2.gamepedia.com/Category:Counters').text
soup = BeautifulSoup(source, 'lxml')
link = soup.find('div', class_="mw-category")

heroes_names = []
savefile = open('file.txt', 'w')

for link in link:
    link = link.text
    heroes = link.split("\n")
    for i in range(1, len(heroes)):
        heroname = heroes[i].split("/")[0]
        print(heroname)
        heroes_names.append(heroname)
        savefile.write(heroname)

# for hero_name in heroes_names:
#     print(hero_name)

savefile.close()
Required output to the txt file (without the bullets):
Abaddon
Alchemist
Ancient Apparition
Anti-Mage
Arc Warden
Axe
Bane
Actual output to the txt file:
AbaddonAlchemistAncient ApparitionAnti-MageArc WardenAxeBane
Instead of
savefile.write(heroname)
do
savefile.write(heroname + "\n")
which will add a line-feed character to the end of each line.
First of all, don't write to the file on each loop iteration; do it as in your commented-out code below.
Also, don't open the file the way you do; use a with block:
with open("file.txt", "a+") as f:
for hero_name in heroes_names:
print("Will write: %s" %hero_name)
f.write("%s\n" %hero_name)

How to get all application links from the log text file?

I have a log file which contains:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
http://www.downloadray.com/windows/Photos_and_Images/Graphic_Capture/
http://www.downloadray.com/windows/Photos_and_Images/Digital_Photo_Tools/
I have this code:
from bs4 import BeautifulSoup
import urllib
import urlparse

f = open("downloadray2.txt")
g = open("downloadray3.txt", "w")
for line in f.readlines():
    i = 1
    while 1:
        url = line+"?page=%d" % i
        pageHtml = urllib.urlopen(url)
        soup = BeautifulSoup(pageHtml)
        has_more = 1
        for a in soup.select("div.n_head2 a[href]"):
            try:
                print (a["href"])
                g.write(a["href"]+"\n")
            except:
                print "no link"
        if has_more:
            i += 1
        else:
            break
This code does not give an error, but it does not work.
I tried modifying it but couldn't solve it.
But when I try this code, it works well:
from bs4 import BeautifulSoup
import urllib
import urlparse

g = open("downloadray3.txt", "w")
url = "http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/"
pageUrl = urllib.urlopen(url)
soup = BeautifulSoup(pageUrl)
i = 1
while 1:
    url1 = url+"?page=%d" % i
    pageHtml = urllib.urlopen(url1)
    soup = BeautifulSoup(pageHtml)
    has_more = 2
    for a in soup.select("div.n_head2 a[href]"):
        try:
            print (a["href"])
            g.write(a["href"]+"\n")
        except:
            print "no link"
    if has_more:
        i += 1
    else:
        break
So how can I make it read from the log text file? It is hard to feed in the links one by one.
Have you stripped the newline from the end of the line?
for line in f.readlines():
    line = line.strip()
readlines() will produce a list of lines taken from the file including the newline \n character.
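A quick way to see the effect (a hypothetical two-liner, not part of the original answer):

line = "http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/\n"
print(repr(line + "?page=1"))          # the \n ends up in the middle of the URL
print(repr(line.strip() + "?page=1"))  # stripping first gives the intended URL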
Proof, by printing the url variable (after the line url = line+"?page=%d" % i):
Your original code:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=1
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=2
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/
?page=3
With my suggested fix:
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=1
http://www.downloadray.com/TIFF-to-JPG_download/
http://www.downloadray.com/Moo0-Image-Thumbnailer_download/
http://www.downloadray.com/Moo0-Image-Sizer_download/
http://www.downloadray.com/Advanced-Image-Viewer-and-Converter_download/
http://www.downloadray.com/GandMIC_download/
http://www.downloadray.com/SendTo-Convert_download/
http://www.downloadray.com/PNG-To-JPG-Converter-Software_download/
http://www.downloadray.com/Graphics-Converter-Pro_download/
http://www.downloadray.com/PICtoC_download/
http://www.downloadray.com/Free-Images-Converter_download/
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=2
http://www.downloadray.com/VarieDrop_download/
http://www.downloadray.com/Tinuous_download/
http://www.downloadray.com/Acme-CAD-Converter_download/
http://www.downloadray.com/AAOImageConverterandFTP_download/
http://www.downloadray.com/ImageCool-Converter_download/
http://www.downloadray.com/GeoJpeg_download/
http://www.downloadray.com/Android-Resizer-Tool_download/
http://www.downloadray.com/Scarab-Darkroom_download/
http://www.downloadray.com/Jpeg-Resizer_download/
http://www.downloadray.com/TIFF2PDF_download/
http://www.downloadray.com/windows/Photos_and_Images/Image_Convertors/?page=3
http://www.downloadray.com/JGraphite_download/
http://www.downloadray.com/Easy-PNG-to-Icon-Converter_download/
http://www.downloadray.com/JBatch-It!_download/
http://www.downloadray.com/Batch-It!-Pro_download/
http://www.downloadray.com/Batch-It!-Ultra_download/
http://www.downloadray.com/Image-to-Ico-Converter_download/
http://www.downloadray.com/PSD-To-PNG-Converter-Software_download/
http://www.downloadray.com/VectorNow_download/
http://www.downloadray.com/KeitiklImages_download/
http://www.downloadray.com/STOIK-Smart-Resizer_download/
Update:
Then again, this code won't run as expected, because the while loop will never stop, since the has_more variable is never changed.
You know that you don't have more links when the list returned by `soup.select(...)` is empty. You can check for emptiness using `len(...)`. So that part might go like this:
list_of_links = soup.select("div.n_head2 a[href]")
if len(list_of_links) == 0:
    break
else:
    for a in soup.select("div.n_head2 a[href]"):
        print (a["href"])
        g.write(a["href"]+"\n")
    i += 1
Apparently the page still displays the latest available page if it's queried beyond the maximum page. So if the maximum page number available is 82 and you query page 83, it will give page 82. To detect this case, you can save the list of the previous page's URLs and compare it with the current list of URLs.
Here is the full code (tested):
from bs4 import BeautifulSoup
import urllib
import urlparse

f = open("downloadray2.txt")
g = open("downloadray3.txt", "w")
for line in f.readlines():
    line = line.strip()
    i = 1
    prev_urls = []
    while 1:
        url = line+"?page=%d" % i
        print 'Examining %s' % url
        pageHtml = urllib.urlopen(url)
        soup = BeautifulSoup(pageHtml)
        list_of_urls = soup.select("div.n_head2 a[href]")
        if set(prev_urls)==set(list_of_urls):
            break
        else:
            for a in soup.select("div.n_head2 a[href]"):
                print (a["href"])
                g.write(a["href"]+"\n")
            i += 1
            prev_urls = list_of_urls
