Python script - grouping words into an if-not statement - python

I'm trying to figure out how to use an if-not statement in which I can group three to four words to omit from a CSV file. Toward the bottom of the code, you'll see that I'm stuck at: if ('reddit', 'passwords') not in x:
Any help would be great.
# import libraries
import bs4
from urllib2 import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.reddit.com/r/NHLStreams/comments/71uhwi/game_thread_sabres_at_maple_leafs_730_pm_et/'

# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

filename = "sportstreams.csv"
f = open(filename, "w")
headers = "Sport Links " + "\n"
f.write(headers)

links = page_soup.select("form a[href]")
for link in links:
    href = link["href"]
    print(href)
    f.write(href + "\n")

with open('sportstreams.csv') as f, open('sstream.csv', "w") as f2:
    for x in f:
        if ('reddit', 'passwords') not in x:  # trying to find multi words to omit
            f2.write(x.strip() + '\n')

Use the built-in function all:
if all(t not in x for t in ('reddit', 'passwords')):
Or any:
if not any(t in x for t in ('reddit', 'passwords')):
Here it is in your context manager:
with open('sportstreams.csv') as f, open('sstream.csv', "w") as f2:
    for line in f:
        if any(t in line for t in ('reddit', 'passwords')):
            # The line contains one of the strings.
            continue
        else:
            # The line contains none of the strings.
            f2.write(line.strip() + '\n')

Related

I'm trying to deduplicate weblinks scraped using Python & BeautifulSoup but it's not working

I'm trying to scrape a website in Python. I got the links to print, but when I try to turn them into a set to deduplicate them, there are still duplicates. Does anyone have any advice on what I am doing wrong? Thanks in advance!
Edit: So I tried what John suggested, but my csv output is a cascading list of links across the Excel sheet; it's crazy... I'll post the changes below this original code:
import requests
from bs4 import BeautifulSoup

page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print('The number of links to start with are: ', len(mylink))
# output = The number of links to start with are: 254

import csv
with open('census_links.csv', 'w', newline='') as f:
    weblinks = str(mylink)
    writer = csv.writer(f, delimiter=' ', lineterminator='\r')
    for link in mylink:
        hrefs = str(link.get('href'))
        if hrefs.startswith("None"):
            ''
        elif hrefs.startswith('http'):
            MySet = set()
            MySet.add(hrefs)
        elif hrefs.startswith('#'):
            ''
        elif hrefs.startswith(' '):
            ''
        print(set(MySet))
        file.write(str(MySet)+'\n')
    file.close
# Edited code:
import requests
from bs4 import BeautifulSoup

page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print('The number of links to start with are: ', len(mylink))
# The number of links to start with are: 254

import csv
with open('census_links.csv', 'w', newline='') as f:
    weblinks = str(mylink)
    writer = csv.writer(f, delimiter=',', lineterminator='\r')
    MySet = set()
    for link in mylink:
        hrefs = str(link.get('href'))
        if hrefs.startswith("None"):
            continue
        elif hrefs.startswith('#'):
            continue
        elif hrefs.startswith(' '):
            continue
        elif hrefs.startswith('http'):
            MySet.add(hrefs)
    file.write(str(MySet)+'\n')
    file.close
print(str(MySet) + '\n')
To get unique links, you want to check whether the link is already in MySet, i.e. hrefs not in MySet.
For a simple operation like this you don't need the csv module: to write the links in a single column, use
"\n".join(MySet)
and to write them in a single row, use
",".join(MySet)
MySet = set()

for link in mylink:
    hrefs = link.get('href')
    if not hrefs or hrefs.startswith('#'):
        continue
    # normalize link
    if hrefs.startswith('/'):
        hrefs = 'https://www.census.gov' + hrefs
    # check if link already in MySet
    if hrefs not in MySet:
        MySet.add(hrefs)

with open('census_links.csv', 'w', newline='') as f:
    f.write("\n".join(MySet))

print("\n".join(MySet))
Initialize the set before the loop, and wait to print it until after the loop is done.
MySet = set()
...
for link in mylink:
    hrefs = str(link.get('href'))
    ...
    if hrefs.startswith('http'):
        MySet.add(hrefs)
...
print(MySet)
Same code as above to get the page content.
import requests
from bs4 import BeautifulSoup
page = "https://www.census.gov/programs-surveys/popest.html"
r = requests.get(page)
raw_html = r.text
soup = BeautifulSoup(raw_html, 'html.parser')
mylink = soup.find_all('a')
print ('The number of links to start with are: ', len(mylink) )
#output = The number of links to start with are: 254
Use pandas to get the unique URLs which start with http.
import pandas as pd
obj = pd.Series(mylink)
obj_link = obj.map(lambda x: x.get('href')).drop_duplicates().dropna()
cond = obj_link.str.startswith('http')
dfn = obj_link.loc[cond].to_frame()
dfn.shape # (93, 1)
dfn.to_csv('census_links.csv', index=False, header=False)

Python code printing only one row in csv file

Recently I tried to code a yp.com list scraper, but I could not figure out why the code prints only one row in the .csv file.
The URLs in yp_urls.txt are:
https://www.yellowpages.com/search-map?search_terms=restaurant&geo_location_terms=Boston
https://www.yellowpages.com/search-map?search_terms=restaurant&geo_location_terms=Boston&page=2
Here is the code:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

with open('yp_urls.txt', 'r') as f:
    for url in f:
        print(url)
        uClient = urlopen(url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class": "v-card"})
        #container= containers[0]

out_filename = "yp_listing.csv"
headers = "URL \n"
f = open(out_filename, "w")
f.write(headers)
for container in containers:
    business = container.a["href"].title()
    print("business:" + business + "\n")
    f.write(business + "," + "\n")
f.close()  # Close the file
Issues:
Code inside your for loops wasn't properly indented.
Open the output file handle outside the for loop.
Try:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

out_filename = "yp_listing.csv"
with open('yp_urls.txt', 'r') as f, open(out_filename, "w") as fout:
    headers = "URL \n"
    fout.write(headers)
    for url in f:
        print(url)
        uClient = urlopen(url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class": "v-card"})
        #container= containers[0]
        for container in containers:
            business = container.a["href"].title()
            print("business:" + business + "\n")
            fout.write(business + "," + "\n")
#f.close() # Close the file (closed by with)
It appears that the f.write commands are outside of your loops, so they are only executed once the loops have completed.
For example, the code loops through the urls, then exits the loop and executes f.write(headers), then loops through containers, exits that loop, and executes f.write(business ...).
You may also wish to check whether the output file is being opened in the right mode: 'w' (write/overwrite) versus 'a' (append). Perhaps also consider renaming the handles so that both are not 'f'.
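As a rough sketch of those last two suggestions (reusing the file names and selectors already in the question, and assuming append mode is what you actually want), you could open the output once in append mode under a handle that does not shadow the input handle:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

# 'a' appends to yp_listing.csv instead of overwriting it, and the output handle
# is named 'fout' so it does not shadow the input handle 'f'
with open('yp_urls.txt', 'r') as f, open('yp_listing.csv', 'a') as fout:
    for url in f:
        page_soup = soup(urlopen(url.strip()).read(), "html.parser")
        for container in page_soup.findAll("div", {"class": "v-card"}):
            fout.write(container.a["href"].title() + ",\n")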

In Python, what should I add to fetch URLs from my text file or my XML file which contains a list of URLs?

I have this code, which works fine with one link.
The code stores the values (availableOffers, otherpricess, currentprice, page_url) in the prices.csv file.
My problems are: First, I do not know what to write to fetch URLs from my text file or my XML file instead of the single URL in this code.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
page_url = "XXXXXXXXX"
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
availableOffers = page_soup.find("input", {"id": "availableOffers"})["value"]
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
currentprice = page_soup.find("div", {"class": "is"}).text.strip().replace("$", "")
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"
f = open(out_filename, "w")
f.write(headers)
f.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + page_url + "\n")
f.close()
Second problem: when a URL does not have a value for otherpricess, I get this error:
line 13, in <module>
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
AttributeError: 'NoneType' object has no attribute 'text'
How do I bypass this error and tell the code to keep working even when a value is missing?
Thanks
To fetch URLs from a text file, you can open the file (exactly as you did for writing) in "r" mode and iterate over its lines.
For example, let's say you have the following URLs file, named urls.txt:
http://www.google.com
http://www.yahoo.com
In order to fetch the urls and iterate over them, do the following:
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"
with open(out_filename, "w") as fw:
    fw.write(headers)
    with open("urls.txt", "r") as fr:
        for url in map(lambda x: x.strip(), fr.readlines()):  # the strip is to remove the trailing '\n'
            print(url)
            uClient = uReq(url)
            page_soup = soup(uClient.read(), "html.parser")
            # write the rest of the logic here
            # ...
            # write to the output file
            fw.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + page_url + "\n")
Regarding your second question, you can check that page_soup.find("span", {"class": "price"}) is not None and if so, extract the text. For example:
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "") if page_soup.find("span", {"class": "price"}) else ""
# in case there is no value, otherpricess will be an empty string, but you can change it to any other value.
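A slightly tidier variant of the same idea (just a sketch, using the tag and class already targeted in the question) stores the result of find in a variable so the lookup runs only once:
price_tag = page_soup.find("span", {"class": "price"})
# if the tag is missing, fall back to an empty string (or any default you prefer)
otherpricess = price_tag.text.replace("$", "") if price_tag else ""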

Why does my word counter produce a different output the first time I run it compared to the second time?

I am doing a basic project for practice. I call a simple Wikipedia page and then write everything into a text file using Beautiful Soup. Then I count the number of times a word appears in that newly written text file.
For some reason, the first time I run the code, I get a different number than the second time I run the code.
I believe that the first time I run the code, the "anime.txt" is different than the second time I run the code.
The problem must be with the way I gather all of my text data with Beautiful Soup.
Please help
from urllib.request import urlopen
from bs4 import BeautifulSoup

f = open("anime.txt", "w", encoding="utf-8")
f.write("")
f.close()

my_url = "https://en.wikipedia.org/wiki/Anime"
uClient = urlopen(my_url)
page_html = uClient.read()
uClient.close()

page_soup = BeautifulSoup(page_html, "html.parser")
p = page_soup.findAll("p")

f = open("anime.txt", "a", encoding="utf-8")
for i in p:
    f.write(i.text)
    f.write("\n\n")

data = open("anime.txt", encoding="utf-8").read()
anime_count = data.count("anime")
Anime_count = data.count("Anime")
print(anime_count, "\n")
print(Anime_count, "\n")
count = anime_count + Anime_count
print("The total number of times the word Anime appears within <p> in the wikipedia page is : ", count)
first output:
anime_count = 14
Anime_count = 97
count = 111
second output:
anime_count = 23
Anime_count = 139
count = 162
EDIT:
I edited my code based on the first two comments and, of course, it works now :P.
Does this look better with regard to opening and closing the file the proper way and the proper number of times?
from urllib.request import urlopen
from bs4 import BeautifulSoup

my_url = "https://en.wikipedia.org/wiki/Anime"
uClient = urlopen(my_url)
page_html = uClient.read()
uClient.close()

page_soup = BeautifulSoup(page_html, "html.parser")
p = page_soup.findAll("p")

f = open("anime.txt", "w", encoding="utf-8")
for i in p:
    f.write(i.text)
    f.write("\n\n")
f.close()

data = open("anime.txt", encoding="utf-8").read()
anime_count = data.count("anime")
Anime_count = data.count("Anime")
print(anime_count, "\n")
print(Anime_count, "\n")
count = anime_count + Anime_count
print("The total number of times the word Anime appears within <p> in the wikipedia page is : ", count)
Do not get confused about opening and closing files: because the file is never closed before it is read back, some of the buffered writes may not have been flushed to disk yet, so the counts depend on whatever happens to be in the file at that moment. Include all the writing and reading parts in with statements, which close (and therefore flush) the file for you.
from urllib.request import urlopen
from bs4 import BeautifulSoup

with open("anime.txt", "w", encoding="utf-8") as outfile:
    my_url = "https://en.wikipedia.org/wiki/Anime"
    uClient = urlopen(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = BeautifulSoup(page_html, "html.parser")
    p = page_soup.findAll("p")
    for i in p:
        outfile.write(i.text)
        outfile.write("\n\n")

with open("anime.txt", "r", encoding="utf-8") as infile:
    data = infile.read()
    anime_count = data.count("anime")
    Anime_count = data.count("Anime")
    print(anime_count, "\n")
    print(Anime_count, "\n")
    count = anime_count + Anime_count
    print("The total number of times the word Anime appears within <p> in the wikipedia page is : ", count)

python 3.6 get text from list

I am an absolute beginner, but I have managed to make a working script out of some existing scripts and tutorials. There is only one thing I still need, and unfortunately I cannot manage it.
So far, I'm getting data from a website such as "http://www.example.com/01536496/.../". Now I have a list (.csv or .txt) with many other numbers in the first column (or, in a txt file, each number on a new row). Now I want to scrape the web data for all the numbers in the list, so "http://www.example.com/No_1/.../", "http://www.example.com/No_2/.../" and so on.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import datetime
my_url = 'http://www.example.com/104289633/.../'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
...
Update
For example I have a numbers.txt with: 05543486 3468169 36189994
Now I want to put each number into the URL...
Can someone please help me? I would be very grateful.
Update
After trying to use the code from Andersson...
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import datetime

# Get list of numbers
with open("numbers.txt") as f:
    content = f.read()
    numbers = content.split()

# Handle each URL in a loop
for number in numbers:
    my_url = 'https://www.immobilienscout24.de/expose/%s#/' % number
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    print(my_url)
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.find_all("div", {"class": "grid-item padding-desk-right-xl desk-two-thirds lap-one-whole desk-column-left flex-item palm--flex__order--1 lap--flex__order--1"})
    filename = "results_" + current_datetime + ".csv"
    f = open(filename, "w")
    headers = "titel##adresse##criteria##preis##energie##beschreibung##ausstattung##lage\n"
    f.write(headers)
    ...
    f.write(titel + "##" + adresse + "##" + criteria.replace(" ", "; ") + "##" + preis.replace(" ", "; ") + "##" + energie.replace(" ", "; ") + "##" + beschreibung.replace("\n", " ") + "##" + ausstattung.replace("\n", " ") + "##" + lage.replace("\n", " ") + "\n")
    f.close()
You can use the code below:
# Get list of numbers
with open("/path/to/numbers.txt") as f:
    content = f.read()
    numbers = content.split()

# Handle each URL in a loop
for number in numbers:
    url = 'http://www.example.com/%s' % number
    # Do something with url
You can create a function that runs a for loop and updates the URL on each iteration through the loop. As the argument, you can pass the list of numbers. For example:
def scrape(numbers):
    for num in numbers:
        my_url = 'http://www.example.com/No_' + str(num) + '/.../'
        uClient = uReq(my_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")

numbers_list = [1, 2, 3, 4, 5]
scrape(numbers_list)
You can achieve this by appending the numbers to the end of your URL with a basic for loop. I am not sure if this is exactly what you need.
...
with open('yourFile', 'r') as numbersFile:
    nums = numbersFile.readlines()

for num in nums:
    url = "http://www.example.com/No_" + num.strip() + "/.../"
    # do what you want to do with the url...
Load from csv file
You can iterate over the file rows in various ways, but what I think is the cleanest one is using pandas.
You just need to do this:
import pandas as pd
df = pd.read_csv("filename.csv")
# assuming that filename.csv's first line has a header called "Numbers"
# You can apply a function `func` to each element of the column via `map`
df['Numbers'].map(func)
Urls from Numbers
Using pandas' map function, we can pass each value to a function to create our url.
# First of all, we define this function
def numberToUrl(number):
    # We can use python's `string.format()` to format a string
    return 'http://www.example.com/{}/.../'.format(number)

# Then we can pass this function to each value with `map`
# and assign the result to a new column
df['url'] = df['Numbers'].map(numberToUrl)

# We can print the first 5 elements via:
df.head()
As you can see, it's extremely simple to pass a function to each row.
If you want to iterate over the rows you can do it like so:
for (index, row) in df['url'].iteritems():
    # Do your operations here
In your case it would be something like this:
for (index, row) in df['url'].iteritems():
    uClient = uReq(row)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    # ...
Additional notes
I would not recommend using urllib.request directly. Instead, you could use the requests library.
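As a minimal sketch of that suggestion (using the placeholder URL pattern from the question), fetching and parsing a page with requests looks roughly like this:
import requests
from bs4 import BeautifulSoup

# placeholder URL following the example pattern used in the question
url = 'http://www.example.com/01536496/.../'
response = requests.get(url)
response.raise_for_status()  # raise an exception for 4xx/5xx responses
page_soup = BeautifulSoup(response.text, "html.parser")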
