I have a text file which I read in, extract the data I require from, and then try to write to a different, new text file, but only the first line makes it into the new file.
import requests
url_file = open('url-test.txt','r')
out_file = open('url.NDJSON','w')
for url in url_file.readlines():
    html = requests.get(url).text
out_file.writelines(html)
out_file.close()
Try:

for url in url_file.readlines():
    html = requests.get(url).text
    out_file.write(html)

or:

lines = []
for url in url_file.readlines():
    html = requests.get(url).text
    # verify you are getting the expected data
    print(111111, html)
    lines.append(html)
out_file.writelines(lines)

Either write each html string to the output file inside the for loop, or append it to a list and call writelines once after the loop.
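For completeness, a minimal sketch of the whole flow using with blocks; the file names are taken from the question, and stripping the newline from each line is an assumption about how the URLs are stored:

import requests

# minimal sketch: read URLs, fetch each page, write every response out
with open('url-test.txt') as url_file, open('url.NDJSON', 'w') as out_file:
    for url in url_file:
        url = url.strip()                # drop the trailing newline
        if not url:
            continue                     # skip blank lines
        html = requests.get(url).text
        out_file.write(html + '\n')      # write inside the loop, one response per line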
I want to open a txt file (which contains multiple links) and scrape the title of each page using BeautifulSoup.
My txt file contains links like this:
https://www.lipsum.com/7845284869/
https://www.lipsum.com/56677788/
https://www.lipsum.com/01127111236/
My code:
import requests as rq
from bs4 import BeautifulSoup as bs

with open('output1.csv', 'w', newline='') as f:
    url = open('urls.txt', 'r', encoding='utf8')
    request = rq.get(str(url))
    soup = bs(request.text, 'html.parser')
    title = soup.findAll('title')
    pdtitle = {}
    for pdtitle in title:
        pdtitle.append(pdtitle.text)
    f.write(f'{pdtitle}')
I want to open all the links in the txt file and scrape the title from each one. The main problem is that opening the txt file into the url variable is not working. How do I open the file and save the data to csv?
Your code isn't working because the url variable holds the whole file object rather than a single URL. You need to request them one by one:
import requests as rq
from bs4 import BeautifulSoup as bs

with open(r'urls.txt', 'r') as f:
    urls = f.readlines()

with open('output1.csv', 'w', newline='') as f:
    for url in urls:
        request = rq.get(str(url))
        soup = bs(request.text, 'html.parser')
        titles = soup.findAll('title')
        for title in titles:
            f.write(f'{title.text}\n')
Your urls may not be working because they are being read with a trailing newline character: \n. You need to strip the text before putting the urls in a list.
Also, you are using .findAll('title'), which returns a list, and that is probably not what you are looking for. You probably just want the first title and that's it. In that case, .find('title') would be better. I have provided some possible corrections below.
from bs4 import BeautifulSoup
import requests

filepath = '...'

with open(filepath) as f:
    urls = [i.strip() for i in f.readlines()]

titles = []
for url in urls:
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    title = soup.find('title')  # Note: will find the FIRST title only
    titles.append(title.text)   # Grabs the TEXT of the title only, removes HTML

new_csv = open('urls.csv', 'w')  # Make sure to prepend with desired location, e.g. 'C:/user/name/urls.csv'
for title in titles:
    new_csv.write(title + '\n')  # The '\n' ensures a new row is written
new_csv.close()
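If the titles can themselves contain commas or quotes, it may be safer to let the csv module handle the quoting; a small follow-up sketch reusing the titles list and the urls.csv name from above:

import csv

# sketch: one title per row, with csv handling any quoting/escaping
with open('urls.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    for title in titles:
        writer.writerow([title])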
The code below only gives me the last word in the list.
import csv

wo = csv.reader(open('WORD.csv'))
row = list(wo)

from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client

# URL to web scrape from.
# in this example we web scrape lexico
with open("WORD.csv") as f:
    for row in csv.reader(f):
        for word in row:
            # Number of pages plus one
            url = "https://www.lexico.pt/{}".format(word)

# opens the connection and downloads html page from url
uClient = uReq(url)
page_html = uClient.read()

# parses html into a soup data structure to traverse html
# as if it were a json data type.
page_soup = soup(page_html, "html.parser")

# finds each product from the store page
containers = page_soup.find("div", {"class": "card card-pl card-pl-significado"})

# name the output file to write to local disk
out_filename = "test.csv"
# opens file, and writes headers
f = open(out_filename, "w")

Word = containers.h2.text
Defention = containers.p.text

f.write("\n" + Word + ", " + Defention + "\n")
f.close()
Please help, I have tried everything. I am a beginner with BeautifulSoup, so sorry for my terrible code formatting.
As I mentioned earlier, I believe that you have already achieved your goal.
In Python, scope is determined by indentation, which defines the area of validity of local variables. Since you do not follow this consistently in your example, the iteration is already complete by the time your first request is sent. The loop variable has already been reassigned and contains the value from the last iteration step.
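A minimal illustration of that effect, with a hypothetical word list and no work done inside the loop:

words = ['casa', 'porta', 'livro']
for word in words:
    pass  # the loop body does nothing with word here

# the loop has finished; word now holds only the last item
print(word)  # -> livro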
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

# open files for reading and writing
with open('WORD.csv') as src, open('test.txt', 'w') as dst:
    # read row by row
    for row in csv.reader(src):
        # get words separated by comma
        for word in row:
            # open connection and create parser with read data
            url = f'https://www.lexico.pt/{word}'
            resp = urlopen(url)
            html = soup(resp.read(), 'html.parser')
            # find card/content
            card = html.find('div', {'class': 'card-pl-significado'})
            word = card.h2.text
            desc = card.p.text
            # write formatted result to file
            dst.write(f'{word}, {desc}\n')
Have fun
I am trying to extract URLs from a text file which contains the source code of a website. I want to get the website links inside the href attributes. I wrote some code I borrowed from Stack Overflow, but I can't get it to work.
with open(sourcecode.txt) as f:
    urls = f.readlines()

urls = [s.strip('\n') for s in urls]
print(url)
Using a regexp, you can extract all urls from the text file, without the need to loop line by line:
import re

with open('/home/username/Downloads/Stack_Overflow.html') as f:
    urls = f.read()

links = re.findall('"((http)s?://.*?)"', urls)
for url in links:
    # each match is a tuple of groups; index 0 is the full link
    print(url[0])
You can use regular expressions for this.
import re

with open('sourcecode.txt') as f:
    text = f.read()

href_regex = r'href=[\'"]?([^\'" >]+)'
urls = re.findall(href_regex, text)
print(urls)
You're probably getting an error like 'sourcecode' is not defined; this is because the parameter that you pass to open() needs to be a string (see above)
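A short sketch of the difference, reusing the sourcecode.txt name from the question:

# correct: the filename is passed as a string literal
with open('sourcecode.txt') as f:
    text = f.read()

# open(sourcecode.txt) instead looks up a variable named sourcecode,
# which raises NameError: name 'sourcecode' is not defined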
from urllib.request import urlopen
from bs4 import BeautifulSoup

# specify the url
wiki = "http://www.bbc.com/urdu"

# Query the website and return the html to the variable 'page'
page = urlopen(wiki)

# Parse the html in the 'page' variable, and store it in Beautiful Soup format
soup = BeautifulSoup(page, "html.parser")

all_links = soup.find_all("a")
for link in all_links:
    #print(link.get("href"))
    #text=soup.body.get_text()
    #print(text)

    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # get text
    text = soup.body.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    print(text)

text1 = str(text)

text_file = open("C:\\Output.txt", 'w')
text_file.write(text)
text_file.close()
I want to extract data from a news website using Beautiful Soup. I wrote some code, but it is not giving me the required output. First, I have to process all the links on a page, extract data from each, and save it to a file. Then move on to the next page, extract data, save it, and so on. Right now I am just trying to process the links on the first page, but it is not giving me the full text and it is also including some tags in the output.
To extract all links from a website you can try something like this:
data = []
soup = BeautifulSoup(page, "html.parser")
for link in soup.find_all('a', href=True):
    data.append(link['href'])

text = '\n'.join(data)
print(text)
And then proceed to save text into a file. After this, you need to iterate over data and fetch each of those urls as well; a rough sketch of that step follows.
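Assuming the hrefs collected in data are absolute URLs (relative links would first need to be joined to the site root with urljoin), and using requests plus a hypothetical articles.txt output file:

import requests
from bs4 import BeautifulSoup

# sketch: visit each collected link and keep its visible text
pages = {}
for href in data:
    if not href.startswith('http'):
        continue  # skip relative links and anchors in this simple sketch
    resp = requests.get(href)
    page_soup = BeautifulSoup(resp.text, 'html.parser')
    pages[href] = page_soup.get_text(separator='\n', strip=True)

# write everything to one file, one block per link
with open('articles.txt', 'w', encoding='utf-8') as out:
    for href, body in pages.items():
        out.write(href + '\n' + body + '\n\n')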
I'm working on a scraper for a number of Chinese documents. As part of the project I'm trying to scrape the body of each document into a list and then write an html version of the document from that list (the final version will include metadata as well as the text, along with a folder full of individual html files for the documents).
I've managed to scrape the body of the document into a list and then use the contents of that list to create a new HTML document. I can even view the contents when I output the list to a csv (so far so good....).
Unfortunately, the HTML document that comes out is full of escaped sequences like "\u6d88\u9664\u8d2b\u56f0\u3001" instead of the Chinese characters.
Is there a way to encode the output so that this won't happen? Or do I just need to grow up and scrape the page for real (parsing and organizing it <p> by <p> instead of just copying all of the existing HTML as is) and then build the new HTML page element by element?
Any thoughts would be most appreciated.
from bs4 import BeautifulSoup
import urllib
#csv is for the csv writer
import csv

#initiates the list to hold the output
holder = []

#this is the target URL
target_url = "http://www.gov.cn/zhengce/content/2016-12/02/content_5142197.htm"

data = []
filename = "fullbody.html"
target = open(filename, 'w')

def bodyscraper(url):
    #opens the url for read access
    this_url = urllib.urlopen(url).read()
    #creates a new BS holder based on the URL
    soup = BeautifulSoup(this_url, 'lxml')
    #finds the body text
    body = soup.find('td', {'class':'b12c'})
    data.append(body)
    holder.append(data)
    print holder[0]
    for item in holder:
        target.write("%s\n" % item)

bodyscraper(target_url)

with open('bodyscraper.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(holder)
Since the source page is UTF-8 encoded, just decode what urllib returns before handing it to BeautifulSoup and it will work. I have tested it and both the HTML and the CSV output show the Chinese characters; here is the amended code:
from bs4 import BeautifulSoup
import urllib
#csv is for the csv writer
import csv

#initiates the list to hold the output
holder = []

#this is the target URL
target_url = "http://www.gov.cn/zhengce/content/2016-12/02/content_5142197.htm"

data = []
filename = "fullbody.html"
target = open(filename, 'w')

def bodyscraper(url):
    #opens the url for read access
    this_url = urllib.urlopen(url).read()
    #creates a new BS holder based on the URL
    soup = BeautifulSoup(this_url.decode("utf-8"), 'lxml')  #decoding what urllib returns
    #finds the body text
    body = soup.find('td', {'class':'b12c'})
    target.write("%s\n" % body)  #write the whole decoded body to the html file directly
    data.append(body)
    holder.append(data)

bodyscraper(target_url)

with open('bodyscraper.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(holder)
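For reference, the code above is Python 2 (urllib.urlopen, the print statement, opening the csv in 'wb' mode). On Python 3, a roughly equivalent sketch reusing the same URL and td class might look like this:

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

target_url = "http://www.gov.cn/zhengce/content/2016-12/02/content_5142197.htm"

# fetch the page and decode it explicitly as UTF-8
raw = urlopen(target_url).read()
soup = BeautifulSoup(raw.decode('utf-8'), 'lxml')
body = soup.find('td', {'class': 'b12c'})

# write the decoded body straight to the html file
with open('fullbody.html', 'w', encoding='utf-8') as target:
    target.write("%s\n" % body)

# on Python 3 the csv file is opened in text mode
with open('bodyscraper.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([body])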