I want to open a txt file (which contains multiple links) and scrape the title from each link using BeautifulSoup.
My txt file contains link like this:
https://www.lipsum.com/7845284869/
https://www.lipsum.com/56677788/
https://www.lipsum.com/01127111236/
My code:
import requests as rq
from bs4 import BeautifulSoup as bs

with open('output1.csv', 'w', newline='') as f:
    url = open('urls.txt', 'r', encoding='utf8')
    request = rq.get(str(url))
    soup = bs(request.text, 'html.parser')
    title = soup.findAll('title')
    pdtitle = {}
    for pdtitle in title:
        pdtitle.append(pdtitle.text)
    f.write(f'{pdtitle}')
I want to open all the links in the txt file and scrape the title from each one. The main problem is that opening the txt file in the url variable is not working. How do I open the file and save the data to a csv?
Your code isn't working because url holds the whole file object, not the individual URLs. You need to request them one by one:
import requests as rq
from bs4 import BeautifulSoup as bs

with open(r'urls.txt', 'r') as f:
    urls = f.readlines()

with open('output1.csv', 'w', newline='') as f:
    for url in urls:
        request = rq.get(str(url))
        soup = bs(request.text, 'html.parser')
        titles = soup.findAll('title')
        pdtitle = []                    # collect the title texts in a list
        for title in titles:
            pdtitle.append(title.text)
        f.write(f'{pdtitle}\n')
Your urls may not be working because they are read in with a trailing newline character: \n. You need to strip each line before putting it in the list.
Also, you are using .find_all('title'), which returns a list, and that is probably not what you are looking for. You probably just want the first title, in which case .find('title') would be better. I have provided some possible corrections below.
from bs4 import BeautifulSoup
import requests

filepath = '...'

with open(filepath) as f:
    urls = [i.strip() for i in f.readlines()]

titles = []
for url in urls:
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    title = soup.find('title')   # Note: will find the FIRST title only
    titles.append(title.text)    # Grabs the TEXT of the title only, removes HTML

new_csv = open('urls.csv', 'w')  # Make sure to prepend with desired location, e.g. 'C:/user/name/urls.csv'
for title in titles:
    new_csv.write(title + '\n')  # The '\n' ensures a new row is written
new_csv.close()
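Since the original question asks about saving to a csv, here is a minimal sketch using the csv module instead of raw writes. It assumes you want one row per URL with its title; urls.txt and output1.csv are the filenames from the question.

import csv

import requests
from bs4 import BeautifulSoup

with open('urls.txt') as f:
    urls = [line.strip() for line in f if line.strip()]   # drop the trailing \n and blank lines

with open('output1.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['url', 'title'])                      # header row
    for url in urls:
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        title = soup.find('title')                         # first <title> only
        writer.writerow([url, title.text if title else ''])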
I am trying to scrape a website that I saved to a local html file. When I use the find_all() method I can get all the tags' text displayed in the Python output. The problem is that I can't get all of that text written to a .txt file.
from bs4 import BeautifulSoup

def interest_retrieval(filename):
    with open(f'{filename}', 'r') as html_file:
        content = html_file.read()
        soup = BeautifulSoup(content, 'lxml')
        interests = soup.find_all('h2')
        for interest in interests:
            with open('interest.txt', 'w') as file:
                file.write(f'{interest.text}')
            print(interest.text)
Python will print all the tags' text, but the .txt file only ends up containing the last tag.
[screenshot of the txt file output]
Edit: I would also like to do a similar thing but with a docx file. I took Igor's suggested code and changed the parts I needed for a docx file, but I'm still having the same issue with the docx file.
from bs4 import BeautifulSoup
import docx

def interest_retrieval(filename):
    with open(f'{filename}', 'r') as html_file:
        content = html_file.read()
        soup = BeautifulSoup(content, 'lxml')
        interests = soup.find_all('h2')
        with open('interest.txt', 'w') as file:
            for interest in interests:
                mydoc = docx.Document()
                mydoc.add_paragraph(f'{interest.text}')
                mydoc.save("C:/Users\satam\PycharmProjects\pythonProject\Web Scraper\list.docx")
                print(interest.text)
You reopen the file in write mode in every iteration; this overwrites its previous contents. Either open it just once and place the loop within the with block, or open it with the a mode (a for "append"; open('interest.txt', 'a')).
(The former is likely preferable in this case as it seems there's no reason to keep opening and closing the file again and again while you're continuously writing to it.)
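For completeness, a minimal sketch of the append-mode option; 'saved_page.html' is just a placeholder for your local file, and a newline is added so the headings don't run together:

from bs4 import BeautifulSoup

with open('saved_page.html', 'r') as html_file:   # hypothetical local file
    soup = BeautifulSoup(html_file.read(), 'lxml')

for interest in soup.find_all('h2'):
    with open('interest.txt', 'a') as file:       # 'a' appends instead of overwriting
        file.write(f'{interest.text}\n')
    print(interest.text)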
Every iteration rewrites the interest.txt file.
You just need to take the with open... part out of the for loop.
Try replacing this fragment
for interest in interests:
    with open('interest.txt', 'w') as file:
        file.write(f'{interest.text}')
    print(interest.text)
with the following code:
with open('interest.txt', 'w') as file:
    for interest in interests:
        file.write(f'{interest.text}')
        print(interest.text)
Here is the complete code:
from bs4 import BeautifulSoup

def interest_retrieval(filename):
    with open(f'{filename}', 'r') as html_file:
        content = html_file.read()
        soup = BeautifulSoup(content, 'lxml')
        interests = soup.find_all('h2')
        with open('interest.txt', 'w') as file:
            for interest in interests:
                file.write(f'{interest.text}')
                print(interest.text)
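You could then call the function with the path to your saved page, for example (the filename here is just a placeholder):

interest_retrieval('saved_page.html')   # hypothetical filename; use your local HTML file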
Edit: Here is the .docx version for the updated question:
from bs4 import BeautifulSoup
import docx

def interest_retrieval(filename):
    with open(f'{filename}', 'r') as html_file:
        content = html_file.read()
        soup = BeautifulSoup(content, 'lxml')
        interests = soup.find_all('h2')
        mydoc = docx.Document()
        for interest in interests:
            mydoc.add_paragraph(f'{interest.text}')
            print(interest.text)
        mydoc.save("C:/Users\satam\PycharmProjects\pythonProject\Web Scraper\list.docx")
N.B.: the docx module can be installed with pip install python-docx.
I am trying to scrape some text from a webpage and save it in a text file using the following code (I am opening links from a text file called links.txt):
import requests
import csv
import random
import string
import re
from bs4 import BeautifulSoup

# Create random string of specific length
def randStr(chars=string.ascii_uppercase + string.digits, N=10):
    return ''.join(random.choice(chars) for _ in range(N))

with open("links.txt", "r") as a_file:
    for line in a_file:
        stripped_line = line.strip()
        endpoint = stripped_line
        response = requests.get(endpoint)
        data = response.text
        soup = BeautifulSoup(data, "html.parser")
        for pictags in soup.find_all('col-md-2'):
            lastfilename = randStr()
            file = open(lastfilename + ".txt", "w")
            file.write(pictags.txt)
            file.close()
        print(stripped_line)
The webpage has the following element:
<div class="col-md-2">
The problem is that after running the code nothing happens and I am not receiving any error.
To get all keyword text from the page into a file, you can do:
import requests
from bs4 import BeautifulSoup

url = "http://www.mykeyworder.com/keywords?tags=dog&exclude=&language=en"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

with open("data.txt", "w") as f_out:
    for inp in soup.select('input[type="checkbox"]'):
        print(inp["value"], file=f_out)
This creates data.txt with content:
dog
animal
canine
pet
cute
puppy
happy
young
adorable
...and so on.
From the BeautifulSoup documentation here, you can see that your line for pictags in soup.find_all('col-md-2') searches for elements with the tag name 'col-md-2', not elements with that class name. In other words, your code would only match elements like <col-md-2></col-md-2>.
Fix your code and try again with for pictags in soup.find_all(class_='col-md-2').
You can match the elements by their relevant attributes: pass a dictionary to the attrs parameter of find_all with the desired attributes of the elements you're looking for.
pictags = soup.find_all(attrs={'class': 'col-md-2'})
This will find all elements with the class 'col-md-2'.
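Plugged into the inner loop of your script, either fix might look roughly like this (assuming .text, rather than .txt, is what you meant to write out):

# inside the existing loop over the lines of links.txt
soup = BeautifulSoup(response.text, "html.parser")
for pictags in soup.find_all(attrs={'class': 'col-md-2'}):   # or: soup.find_all(class_='col-md-2')
    lastfilename = randStr()
    with open(lastfilename + ".txt", "w") as out_file:
        out_file.write(pictags.text)   # .text gives the element's text; .txt is not a Tag attribute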
I'm very new to Python and I'm trying to code a program that extracts the text inside html tags (without the tags) and writes it to a different text file for future analysis. I referred to this and this as well, and came up with the code below. But how can I write this as a separate function? Something like
def read_list('file1.txt')
and then do the same scraping? The reason I'm asking is that the output of this code (op1.txt) will be used for stemming and then for other calculations afterwards. The output of this code doesn't write line by line as intended either. Thank you very much for any input!
f = open('file1.txt', 'r')
for line in f:
    url = line
    html = urlopen(url)
    bs = BeautifulSoup(html, "html.parser")
    content = bs.find_all(['title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])
    with open('op1.txt', 'w', encoding='utf-8') as file:
        file.write(f'{content}\n\n')
        file.close()
Try it like this:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def read_list(fl):
    # Open the output file once, outside the loop, so each URL's text is added
    # rather than overwriting the previous one
    with open(fl, 'r') as f, open('op1.txt', 'w', encoding='utf-8') as file:
        for line in f:
            html = urlopen(line.strip()).read().decode("utf8")
            bs = BeautifulSoup(html, "html.parser")
            content = '\n'.join(x.text for x in bs.find_all(['title', 'p'] + [f'h{n}' for n in range(1, 7)]))
            file.write(f'{content}\n\n')
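You can then call the function with the name of your links file (file1.txt from the question):

read_list('file1.txt')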
So I have a script that extracts all links from a web site. I thought that converting to a list would ensure I only returned unique links, but there are still duplicates in the output (e.g. 'www.commerce.gov/' and 'www.commerce.gov') because the code is not handling the trailing characters. Below is my code. Any help is appreciated. Thanks.
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import csv

req = Request("https://www.census.gov/programs-surveys/popest.html")
html_page = urlopen(req)
soup = BeautifulSoup(html_page, "lxml")
prettyhtml = soup.prettify()

Html_file = open("U:\python_intro\popest_html.txt", "w")
Html_file.write(prettyhtml)
Html_file.close()

links = []
for link in soup.findAll('a', attrs={'href': re.compile(r'^(?:http|ftp)s?://')}):
    links.append(link.get('href'))

links = set(links)

myfile = "U:\python_stuff\links.csv"
with open(myfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    for a in links:
        writer.writerow([a])
You mean "converting to a set" not a list.
You can remove any possible trailing '/':
links.append(link.get('href').rstrip('/'))
Or even better, build a set in the first place:
links = set()
for link in soup.findAll('a', attrs={'href': re.compile(r'^(?:http|ftp)s?://')}):
    links.add(link.get('href').rstrip('/'))
I'm working on a scraper for a number of Chinese documents. As part of the project I'm trying to scrape the body of the document into a list and then write an html version of the document from that list (the final version will include metadata as well as the text, along with a folder full of individual html files for the documents).
I've managed to scrape the body of the document into a list and then use the contents of that list to create a new HTML document. I can even view the contents when I output the list to a csv (so far so good....).
Unfortunately the HTML document that is output is all escape sequences like "\u6d88\u9664\u8d2b\u56f0\u3001" rather than the Chinese text.
Is there a way to encode the output so that this won't happen? Or do I just need to grow up and scrape the page for real (parsing and organizing it <p> by <p> instead of just copying all of the existing HTML as is) and then build the new HTML page element by element?
Any thoughts would be most appreciated.
from bs4 import BeautifulSoup
import urllib
#csv is for the csv writer
import csv

#initiates the dictionary to hold the output
holder = []
#this is the target URL
target_url = "http://www.gov.cn/zhengce/content/2016-12/02/content_5142197.htm"
data = []
filename = "fullbody.html"
target = open(filename, 'w')

def bodyscraper(url):
    #opens the url for read access
    this_url = urllib.urlopen(url).read()
    #creates a new BS holder based on the URL
    soup = BeautifulSoup(this_url, 'lxml')
    #finds the body text
    body = soup.find('td', {'class':'b12c'})
    data.append(body)
    holder.append(data)
    print holder[0]
    for item in holder:
        target.write("%s\n" % item)

bodyscraper(target_url)

with open('bodyscraper.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(holder)
Since the source page is UTF-8 encoded, just decode what urllib returns before handing it to BeautifulSoup and it will work. I have tested that both the HTML and the CSV output show the Chinese characters; here is the amended code:
from bs4 import BeautifulSoup
import urllib
#csv is for the csv writer
import csv

#initiates the dictionary to hold the output
holder = []
#this is the target URL
target_url = "http://www.gov.cn/zhengce/content/2016-12/02/content_5142197.htm"
data = []
filename = "fullbody.html"
target = open(filename, 'w')

def bodyscraper(url):
    #opens the url for read access
    this_url = urllib.urlopen(url).read()
    #creates a new BS holder based on the URL
    soup = BeautifulSoup(this_url.decode("utf-8"), 'lxml')  #decoding urllib returns
    #finds the body text
    body = soup.find('td', {'class':'b12c'})
    target.write("%s\n" % body)  #write the whole decoded body to html directly
    data.append(body)
    holder.append(data)

bodyscraper(target_url)

with open('bodyscraper.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(holder)
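Note that the snippets above are Python 2 (urllib.urlopen, the print statement, csv files opened in 'wb' mode). A rough Python 3 sketch of the same approach, assuming the page layout is unchanged, might look like this:

import csv
from urllib.request import urlopen

from bs4 import BeautifulSoup

target_url = "http://www.gov.cn/zhengce/content/2016-12/02/content_5142197.htm"

html = urlopen(target_url).read().decode("utf-8")   # decode the raw bytes explicitly
soup = BeautifulSoup(html, "lxml")
body = soup.find('td', {'class': 'b12c'})            # the table cell holding the body text

with open("fullbody.html", "w", encoding="utf-8") as target:
    target.write("%s\n" % body)                      # Chinese characters are written as-is

with open("bodyscraper.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow([body])                   # csv converts the tag to its string form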