The IDE output shows line breaks, but the txt output file does not. What am I missing?
from bs4 import BeautifulSoup
import requests
# Download the category page and pick out the <div class="mw-category"> element.
source = requests.get('https://dota2.gamepedia.com/Category:Counters').text
soup = BeautifulSoup(source, 'lxml')
link = soup.find('div', class_="mw-category")
heroes_names = []
savefile = open('file.txt', 'w')
for link in link:
    link = link.text
    heroes = link.split("\n")
    # Index 0 is skipped; every remaining line is cut at its first "/".
    for i in range(1,len(heroes)):
        heroname = heroes[i].split("/")[0]
        print(heroname)
        heroes_names.append(heroname)
        # write() appends no line terminator (unlike print() above), so the
        # names land back-to-back on a single line of file.txt.
        savefile.write(heroname)
# for hero_name in heroes_names:
#     print(hero_name)
savefile.close()
required output to txt file (without the bullets) :
Abaddon
Alchemist
Ancient Apparition
Anti-Mage
Arc Warden
Axe
Bane
actual output to txt file :
AbaddonAlchemistAncient ApparitionAnti-MageArc WardenAxeBane
Instead of
savefile.write(heroname)
do
# File objects have write() and writelines() but no writeline(); write() with
# an explicit "\n" ends each hero name's line.
savefile.write(heroname + "\n")
which will add a linefeed character to the end of your output.
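First of all, don't write to the file on every iteration of the loop. Collect the names first and write them in a separate loop, as in the code you commented out.
Also, don't open the file the way you do; use a with block so it is closed automatically: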
# "a+" opens for appending, so whatever file.txt already holds is kept.
with open("file.txt", "a+") as f:
    for hero_name in heroes_names:
        print("Will write: %s" %hero_name)
        # One name per line: the "\n" is part of what gets written.
        f.write("%s\n" %hero_name)
Related
I want to run a search in a loop using keywords from a file, with Selenium and BeautifulSoup:
read the first row, put its value (one keyword) into the search query, and run the search; when that is done, use the second row from the file, and so on.
The file-reading part prints all the keywords, one per row, but I am not sure how to put them into the search query one at a time.
def SearchFuncs():
    """Load the hard-coded search page and return its first matching anchor.

    Returns the first ``a`` element found under ``div.class_name``; if the
    selector matches nothing the loop body never runs and the function
    falls through, returning None.
    """
    driver.get('https://www.website.com/search/?q=pet%20care') #put the value from one row on search/?q=
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.select('div.class_name a')
    for a in soup.select('div.class_name a'):
        #print(a['title'])
        # return exits on the first iteration, so only one anchor is ever used.
        return a
#SearchFuncs()
x = SearchFuncs()
print(x ['title'])
# read file section:
with open ("kw-to-search.txt", "r") as f:
    for line in f:
        print(line.strip())
Updated: I also added save the result to file
but I tested the codes without save to file section
this is the code I tried using one of the solution (broderick) provided, thank you broderick, I don't get any output, and neither any error:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time
# NOTE(review): the indentation of this snippet was lost. The nesting below is
# inferred, not confirmed: with both statements in the anchor loop commented
# out, the file-reading code must sit inside that loop for the script to
# parse, and a script whose only content is a never-called function matches
# the reported "no output, no error". Verify against the asker's real file.
def SearchFuncs(addr):
    driver.get(addr)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.select('div.class_name a')
    for a in soup.select('div.class_name a'):
        #return a
        #print(a ['title'])
        with open ("kw.txt", "r") as f:
            for line in f:
                addr_to_search = 'https://www.website.com/search/?q='
                # Build search query from lines
                pieces = line.split()
                query = ''
                for i in range(len(pieces) - 1):
                    query += (pieces[i] + '%20')
                query += pieces[-1]
                # Debugging print
                print(query)
                addr_to_search += query
                SearchFuncs(addr_to_search)
                textList = a['title']
                outF = open("keyword_result.txt", 'a')
                for line in textList:
                    # write line to output file
                    outF.write(line)
                    #outF.write("\n")
                outF.write(textList + '\n')
                outF.close()
Updated with another code
This is another variation Arthur Pereira provided, thank you, Arthur Pereira
def SearchFuncs(url):
    """Load *url* and try to record the title of a matching anchor.

    As written, ``return a`` leaves the function on the first matching
    anchor, so the file-writing code below only runs when the selector
    matched nothing.
    """
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.select('div.class_name a')
    for a in soup.select('div.class_name a'):
        return a
    #y = SearchFuncs(url)
    #print(y ['title'])
    #print(a['title'])
    # Reached only when the loop never ran: `a` is still the list returned by
    # soup.select(), and indexing a list with a string raises TypeError.
    textList = a['title']
    outF = open("Keyword_results-2.txt", 'a')
    for line in textList:
        # write line to output file
        outF.write(line)
        #outF.write("\n")
    outF.write(textList + '\n')
    outF.close()
with open("kw.txt", "r") as f:
    for line in f:
        # Encode spaces for the query string and search for each keyword line.
        query = line.strip().replace(" ", "%20")
        url = "https://www.website.com/search/?q=" + query
        SearchFuncs(url)
Error:
Traceback (most recent call last):
File "c:/Users/mycomp/Desktop/Python/test/Test-Search-on-Pin-fromList-1.py", line 45, in <module>
SearchFuncs(url)
File "c:/Users/mycomp/Desktop/Python/test/Test-Search-on-Pin-fromList-1.py", line 31, in SearchFuncs
textList = a['title']
TypeError: list indices must be integers or slices, not str
Iterate over each line in your text file and prepare it for the search. Then pass the resulting URL to your search function as a parameter.
Also, I think you misunderstand the concept of return. Here your code returns the first a element and leaves the function immediately, so nothing after it is executed:
for a in soup.select('div.Eqh.F6l.Jea.k1A.zI7.iyn.Hsu a'):
return a
The error you are getting occurs because your select finds nothing: the for loop never runs, so a is still the (empty) list returned by soup.select, and indexing a list with a string raises the TypeError:
textList = a['title']
So, assuming you want to get the text inside each anchor element, you have to find the correct div and step into its a elements. Then you can get the title and write it to a file.
def SearchFuncs(url):
    """Load *url* in the browser and append each result title to a file.

    Every ``a`` element under ``div.Collection-Item`` that has a ``title``
    attribute contributes one stripped line to ``Keyword_results-2.txt``.
    """
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    collection = soup.select('div.Collection-Item a')
    # Open the output once per page instead of once per anchor.
    with open("Keyword_results-2.txt", 'a', encoding="utf-8") as outF:
        for item in collection:
            title = item.get('title')
            if title is None:
                # item['title'] would raise KeyError for an anchor without a title.
                continue
            outF.write(title.strip() + '\n') # write line to output file
with open("kw.txt", "r") as f:
    for line in f:
        keyword = line.strip()
        if not keyword:
            # A blank line would otherwise trigger a search with an empty query.
            continue
        # Encode spaces for the query string, then search this keyword.
        query = keyword.replace(" ", "%20")
        url = "https://www.pinterest.com/search/pins/?q=" + query
        SearchFuncs(url)
Try
def SearchFuncs(addr):
    # Take the address to load as a parameter instead of hard-coding it.
    driver.get(addr)
    ...
and
with open ("kw-to-search.txt", "r") as f:
    for line in f:
        # Build search query from lines: the words joined by an encoded space.
        pieces = line.split()
        if not pieces:
            # Blank line: indexing pieces[-1] would raise IndexError.
            continue
        query = '%20'.join(pieces)
        # Debugging print
        print(query)
        addr_to_search = 'https://www.website.com/search/?q=' + query
        SearchFuncs(addr_to_search)
I have a beautiful soup program where I find all the links on a webpage and put it in a queue.txt file. The program then gets each link from the file and find all the links on those links. They then get put into a crawled.txt file for all the crawled links.
I want to make sure I get no duplicates so I want the program to go through the queue.txt and crawled.txt and if the links that have just been found are in those files, then the new found links shouldn't be put in the file
I have tried doing it so that it prints the newly found links into a list and removes duplicates from there and prints the list to a .txt file but it wouldn't be able to tell what is in the queue file it only removes duplicates from the newly found links from the one page.
This is the code:
from bs4 import BeautifulSoup
import requests
import re
from urllib.parse import urlparse
def get_links(base_url, file_name):
    """Fetch *base_url* and append the links found on it to *file_name*.

    Each ``<a>`` tag's href is converted to a string. ``//host/...`` links get
    an ``https:`` prefix and ``/path`` links get the scheme and host of
    *base_url*. Links still starting with one of the skipped characters are
    dropped; the rest are written one per line.
    """
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    single_slash = re.compile(r'^/\w')
    double_slash = re.compile(r'^//\w')
    parsed_uri = urlparse(base_url)
    domain_name = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
    # First characters that cause a link to be skipped. "N" also catches the
    # string "None" that str() produces for a tag with no href attribute.
    skipped_prefixes = ("#", "j", "q", "u", "N", "m")
    with open(file_name, "a") as f:
        for tag in soup.find_all('a'):
            link = str(tag.get('href'))
            if link.startswith("http"):
                print(link)
            if double_slash.match(link):
                link = 'https:' + link
                print(link)
            if single_slash.match(link):
                link = domain_name + link
                print(link)
            if link.startswith(skipped_prefixes):
                continue
            try:
                f.write(link + '\n')
            except UnicodeEncodeError:
                # Best effort: skip a link the file's encoding cannot
                # represent, but let real I/O failures surface.
                continue
# NOTE(review): indentation was lost in this snippet; the nesting below is the
# most plausible reading of a crawl loop and should be checked against the
# asker's original file.
get_links('https://stackabuse.com/reading-and-writing-lists-to-a-file-in-python/', "queue.txt")
with open('queue.txt') as f:
    lines = f.read().splitlines()
print(lines)
for link in lines:
    if lines[0] == "/":
        del lines[0]
    print(lines[0])
    # Record the head of the queue as crawled, then drop it from the queue.
    with open('crawled.txt', 'a') as h:
        h.write('%s\n' % lines[0])
        h.close()
    del lines[0]
    if lines[0] == "/":
        del lines[0]
    # Rewrite queue.txt without the entries removed above.
    with open('queue.txt', 'w') as filehandle:
        for listitem in lines:
            filehandle.write('%s\n' % listitem)
    # Crawl the new head of the queue; get_links appends what it finds.
    page_url = lines[0]
    get_links(page_url, "queue.txt")
    print(lines)
    with open('queue.txt') as f:
        lines = f.read().splitlines()
In general for Python, when trying to remove duplicates, sets are usually a good bet. For example:
# Read the queue, collapse duplicate links with a set, then rewrite the file.
# The with blocks close both handles, so the rewritten file is flushed.
with open('queue.txt', 'r') as queue_file:
    # splitlines() drops the line terminators, so a final line without "\n"
    # still compares equal to its duplicates.
    lines = queue_file.read().splitlines()
queue_set = set(lines)
with open('queue.txt', 'w') as result:
    for line in queue_set:
        result.write(line + '\n')
Note: This will not preserve the order of the links, but I don't see a reason for that in this case.
Also, this was adapted from this answer.
I have a file called IP2.txt and this file contains 2 rows as shown below.
103.201.150.209
113.170.129.113
My code goes like this it reads the file IP2 and looks up the website search
import requests
from bs4 import BeautifulSoup
import re
fh = open('IP2.txt')
for line in fh:
    ip = line.rstrip()
    print(ip)
    loads = {'q':line.rstrip(),'engine':1}
    r = requests.get('https://fortiguard.com/search',params=loads)
# print(r.url)
# print(r.text)
# Everything from here on is outside the loop, so it only sees the `r` and
# `ip` left over from the final line of IP2.txt.
Link_text = r.text
soup = BeautifulSoup(Link_text, 'lxml')
for product in soup.find_all('section', class_='iprep'):
    product_title = product.find("a").text
    print(ip+':'+ product_title)
fh.close()
The output of the above code is like this.
103.201.150.209
113.170.129.113
113.170.129.113:Malicious Websites
As you can see it's reading the last line and skipping the first value: 103.201.150.209
It seems like your indentation is not correct, causing lines that should be part of your loops to be executed only once after those loops are over. You are probably looking for this:
with open('IP2.txt') as fh:
    for line in fh:
        ip = line.rstrip()
        print(ip)
        loads = {'q':line.rstrip(), 'engine':1}
        r = requests.get('https://fortiguard.com/search', params=loads)
        # do the following for ALL ips
        soup = BeautifulSoup(r.text, 'lxml')
        for product in soup.find_all('section', class_='iprep'):
            product_title = product.find("a").text
            # print ALL products
            print(ip + ':' + product_title)
Also note the use of with which will auto-close your file even if something goes wrong in between.
You are overriding r value every time in your for loop. You can create a list outside of your loop and append to it every time in loop. Other way would be to do all your BeautifulSoup operations and printing inside your for loop, then you will be getting your printout for every r.
I think you need to loop over the return values from requests:
import requests
from bs4 import BeautifulSoup
import re
with open('IP2.txt') as fh:
    # Keep each IP next to its response text; otherwise the reporting loop
    # below would label every result with the last IP read.
    texts = []
    for line in fh:
        ip = line.rstrip()
        print(ip)
        loads = {'q':ip,'engine':1}
        r = requests.get('https://fortiguard.com/search',params=loads)
        texts.append((ip, r.text))
for ip, text in texts:
    soup = BeautifulSoup(text, 'lxml')
    for product in soup.find_all('section', class_='iprep'):
        product_title = product.find("a").text
        print(ip+':'+ product_title)
I think what you need is :
import requests
from bs4 import BeautifulSoup
import re
# The with block closes IP2.txt even if a request or parse step raises.
with open('IP2.txt') as fh:
    for line in fh:
        ip = line.rstrip()
        print(ip)
        loads = {'q':ip,'engine':1}
        r = requests.get('https://fortiguard.com/search',params=loads)
        # Parse and report inside the loop so every IP is handled.
        Link_text = r.text
        soup = BeautifulSoup(Link_text, 'lxml')
        for product in soup.find_all('section', class_='iprep'):
            product_title = product.find("a").text
            print(ip+':'+ product_title)
Seems more like an indentation problem.
I have a bunch of web text that I'd like to scrape and export to a csv file. The problem is that the text is split over multiple lines on the website and that's how beautifulsoup reads it. When I export to csv, all the text goes into one cell but the cell has multiple lines of text. When I try to read the csv into another program, it interprets the multiple lines in a way that yields a nonsensical dataset. The question is, how do I put all the text into a single line after I pull it with beautifulsoup but before I export to csv?
Here's a simple working example demonstrating the problem of multiple lines (in fact, the first few lines in the resulting csv are blank, so at first glance it may look empty):
import csv
import requests
from bs4 import BeautifulSoup
def main():
    """Fetch the article page and write the <article> text as one CSV cell."""
    r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
    soup = BeautifulSoup(r.text,"html.parser")
    with open('Temp.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f,delimiter=",")
        # .text is written unchanged, so any line breaks in it end up inside
        # the single cell.
        abstract=soup.find("article").text
        writer.writerow([abstract])
if __name__ == '__main__':
    main()
UPDATE: there have been some good suggestions, but it's still not working. The following code still produces a csv file with line breaks in a cell:
import csv
import requests
from bs4 import BeautifulSoup
with open('Temp.csv', 'w', encoding='utf8', newline='') as f:
    writer = csv.writer(f,delimiter=',')
    r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
    soup = BeautifulSoup(r.text,'lxml')
    find_article = soup.find('article')
    # Second <p> inside the article, and the node that directly follows it.
    find_2para = find_article.p.find_next_sibling("p")
    find_largetxt = find_article.p.find_next_sibling("p").nextSibling
    # The nodes themselves (not their .text) are handed to the writer.
    writer.writerow([find_2para,find_largetxt])
Here's another attempt based on a different suggestion. This one also ends up producing a line break in the csv file:
import csv
import requests
from bs4 import BeautifulSoup
def main():
    """Fetch the article page and write its text, space-separated, as one CSV cell."""
    r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
    soup = BeautifulSoup(r.text,"html.parser")
    with open('Temp.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f,delimiter=",")
        # Text fragments are stripped and joined with a single space.
        abstract=soup.find("article").get_text(separator=" ", strip=True)
        writer.writerow([abstract])
if __name__ == '__main__':
    main()
Change your abstract = ... line into:
abstract = soup.find("article").get_text(separator=" ", strip=True)
It joins the pieces of text using the separator parameter (in this case, a single space), and strip=True trims the whitespace around each piece.
The solution that ended up working for me is pretty simple:
# split() with no argument breaks on any run of whitespace (tabs, CR, LF,
# spaces); joining with one space removes the line breaks without fusing the
# last word of one line to the first word of the next.
abstract = " ".join(soup.find("article").text.split())
That gets rid of all line breaks.
r = requests.get("https://www.econometricsociety.org/publications/econometrica/2017/03/01/search-yield")
soup = BeautifulSoup(r.text,'lxml') # I prefer using the lxml parser
find_article = soup.find('article')
# The next line shows how to find the title, in this case: Econometrica: Mar 2017, Volume 85, Issue 2
find_title = find_article.h3
# find the article heading (presumably "Search Yield" -- confirm against the page)
find_yeild = find_article.h1
# first paragraph, for example: DOI: 10.3982/ECTA14057 p. 351-378
find_1para = find_article.p
# second paragraph, for example: David Martinez-Miera, Rafael Repullo
find_2para = find_article.p.find_next_sibling("p")
# find the large text area, e.g. 'We present a model of the relationship bet...'
find_largetxt = find_article.p.find_next_sibling("p").nextSibling
I used a variety of methods to get to the text areas you want, purely for the purpose of education (you can use .text on each of these to get the text without tags, or you can use Zroq's method).
But you can write each one of these into the file by doing for example
# writerow() expects a sequence of fields; a bare string would be split into
# one column per character, so wrap the text in a list.
writer.writerow([find_title.text])
While coding an image downloader I was testing some functions, and I want to store all the links in links.txt. However, only one link is written to the file, even though the run window shows all the links that were found. Please help me fix this problem. Also, once it is fixed, I would like to know how many links there are; I tried some functions, but they did not work the way I wanted. Thank you so much!
Here's the code so far:
# import random
# import urllib.request
import requests
from bs4 import BeautifulSoup
def Download_Image_from_Web(url):
    """Fetch *url*, then write and print the src of each .jpg <img> found."""
    # name = random.randrange(1, 1000)
    # fullName = str(name) + ".jpg"
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for link in soup.findAll('img'):
        image_links = link.get('src')
        if '.jpg' in image_links:
            raw_text = r'links.txt'
            # Mode 'w' truncates links.txt on every matching image, so the
            # file only ever holds the most recent link.
            fw = open(raw_text, 'w')
            for i in image_links.split("\\n"):
                fw.write(i+'\n')
            fw.close()
            # Read the file back and print it once per matching image.
            fr = open('links.txt', 'r')
            text = fr.read()
            print(text)
Download_Image_from_Web("https://pixabay.com/")
Below is the program based on original sample using with context for reference.
import requests
from bs4 import BeautifulSoup
def Download_Image_from_Web(url):
    """Fetch *url*, save every .jpg image link to links.txt and report them.

    links.txt is opened once in write mode, so it is rebuilt on each call and
    holds every matching link rather than only the last one. The file is then
    read back and printed together with its line count.
    """
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    raw_text = 'links.txt'
    with open(raw_text, 'w') as fw:
        for link in soup.findAll('img'):
            image_links = link.get('src')
            # An <img> without a src attribute gives None, and `in` on None
            # raises TypeError, so test for a value first.
            if image_links and '.jpg' in image_links:
                for i in image_links.split("\\n"):
                    fw.write(i+'\n')
    with open(raw_text, 'r') as fr:
        text = fr.read()
    # print(text)
    print("Total {} images found:\n{}".format(len(text.splitlines()), text))
Download_Image_from_Web("https://pixabay.com/")
EDIT1: Remove the description on previous with sample in first paragraph.
EDIT2: Add image counts in output.
You have to open the file in append mode, not in write mode.
Write mode truncates the file every time it is opened, so each link overwrites the previous one. With append mode your code will be:
# Fragment: belongs inside the per-image loop of the function.
with open(raw_text, 'a') as fw: #this opens file in append mode
    for i in image_links.split("\\n"):
        fw.write(i+'\n')
# The with block also closes the read handle, which was previously left open.
with open('links.txt', 'r') as fr:
    text = fr.read() #this prints all the written content for each line
print(text)
You can print the entire written content at the end like this
# Fragment: this first part belongs inside the per-image loop of the function.
with open(raw_text, 'a') as fw: #this opens file in append mode
    for i in image_links.split("\\n"):
        fw.write(i+'\n')
# NOTE(review): the part below is meant to run once, after the loop over all
# images has finished (i.e. dedented to function level in the full program).
with open('links.txt', 'r') as fr:
    text = fr.read() #this prints all the written content at the end.
print(text)
If you need the count of images, you can call the built-in len() function on image_links.split("\\n"), so in that case your code will be:
# NOTE: image_links is the src of a single <img>, so this counts the pieces of
# that one value, not the number of images on the page.
Images= image_links.split("\\n")
# print() as a function, matching the print(text) call below (the statement
# form is Python 2 only).
print("no of images = ",len(Images))
with open(raw_text, 'a') as fw: #this opens file in append mode
    for i in Images:
        fw.write(i+'\n')
# The with block also closes the read handle, which was previously left open.
with open('links.txt', 'r') as fr:
    text = fr.read() #this prints all the written content at the end
print(text)