Get data of a XML with criteria - python

i have the following code:
import pandas as pd
import urllib.parse
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
url = 'http://windte2001.acepta.com/v01/E67EBB4910CFDCB067EB7D85FBA6E5511D0E64A9'.replace('/v01/', '/depot/')
x = urlopen(url)
new = x.read()
soup = BeautifulSoup(new, "lxml-xml")
result = soup.find_all(['NmbItem','QtyItem'])
that brings the next result in xml
<NmbItem>SERV. MANEJO DE LIQUIDOS</NmbItem>, <QtyItem>22.00</QtyItem>, <NmbItem>SERV. MANEJO DE RESPEL</NmbItem>, <QtyItem>1.00</QtyItem>]
All i need if NmbItem contains 'LIQUIDOS' bring me the 'QtyItem' in this case is 22
How can i do this with python in this xml?

You can use regular expression.
import re
from bs4 import BeautifulSoup
soup=BeautifulSoup(new,'xml')
result=soup.find('NmbItem',text=re.compile("LIQUIDOS")).find_next('QtyItem').text
print(result)

You can do like this:
result = soup.find_all(['NmbItem'])
for item in result:
if 'LIQUIDOS' in item.text:
print(list(item.next_siblings)[3].text)

Related

Python REGEX remove string containing substring

I am writing a script that will scrape a newsletter for URLs. There are some URLs in the newsletter that are irrelevant (e.g. links to articles, mailto links, social links, etc.). I added some logic to remove those links, but for some reason not all of them are being removed. Here is my code:
from os import remove
from turtle import clear
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")
termSheetLinks = []
for companyURL in htmlParser.select("table#templateBody p > a"):
termSheetLinks.append(companyURL.get('href'))
for link in termSheetLinks:
if "fortune.com" in link in termSheetLinks:
termSheetLinks.remove(link)
if "forbes.com" in link in termSheetLinks:
termSheetLinks.remove(link)
if "twitter.com" in link in termSheetLinks:
termSheetLinks.remove(link)
print(termSheetLinks)
When I ran it most recently, this was my output, despite trying to remove all links containing "fortune.com":
['https://fortune.com/company/blackstone-group?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://fortune.com/company/tpg?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://casproviders.org/asd-guidelines/', 'https://fortune.com/company/carlyle-group?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://ir.carlyle.com/static-files/433abb19-8207-4632-b173-9606698642e5', 'mailto:termsheet#fortune.com', 'https://www.afresh.com/', 'https://www.geopagos.com/', 'https://montana-renewables.com/', 'https://descarteslabs.com/', 'https://www.dealer-pay.com/', 'https://www.sequeldm.com/', 'https://pueblo-mechanical.com/', 'https://dealcloud.com/future-proof-your-firm/', 'https://apartmentdata.com/', 'https://www.irobot.com/', 'https://www.martin-bencher.com/', 'https://cell-matters.com/', 'https://www.lever.co/', 'https://www.sigulerguff.com/']
Any help would be greatly appreciated!
It do not need a regex in my opinion - Instead of removing the urls, append only those to a list that do not contain your substrings, eg with a list comprehension:
[companyURL.get('href') for companyURL in htmlParser.select("table#templateBody p > a") if not any(x in companyURL.get('href') for x in ["fortune.com","forbes.com","twitter.com"])]
Example
from bs4 import BeautifulSoup
import requests
termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")
myList = ["fortune.com","forbes.com","twitter.com"]
[companyURL.get('href') for companyURL in htmlParser.select("table#templateBody p > a")
if not any(x in companyURL.get('href') for x in myList)]
Output
['https://casproviders.org/asd-guidelines/',
'https://ir.carlyle.com/static-files/433abb19-8207-4632-b173-9606698642e5',
'https://www.afresh.com/',
'https://www.geopagos.com/',
'https://montana-renewables.com/',
'https://descarteslabs.com/',
'https://www.dealer-pay.com/',
'https://www.sequeldm.com/',
'https://pueblo-mechanical.com/',
'https://dealcloud.com/future-proof-your-firm/',
'https://apartmentdata.com/',
'https://www.irobot.com/',
'https://www.martin-bencher.com/',
'https://cell-matters.com/',
'https://www.lever.co/',
'https://www.sigulerguff.com/']
Removing the links after the for iterator will not skip any entry.
from os import remove
from turtle import clear
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")
termSheetLinks = []
for companyURL in htmlParser.select("table#templateBody p > a"):
termSheetLinks.append(companyURL.get('href'))
lRemove = []
for link in termSheetLinks:
if "fortune.com" in link:
lRemove.append(link)
if "forbes.com" in link:
lRemove.append(link)
if "twitter.com" in link:
lRemove.append(link)
for l in lRemove:
termSheetLinks.remove(l)
print(termSheetLinks)

Trying to web scrape all the tables on a web page

import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from time import sleep
from random import randint
import re
towns = pd.DataFrame()
url = f"https://www.city-data.com/city/Adak-Alaska.html"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
table_data = doc.findAll("td")
#for i in table_data:
#towns.append(table_data[i])
print(table_data)
I'm trying to get the data from the tables, like numbers of adherents to certain religions, ethnic groups, etc. When I look at the source page all that stuff is between the td tags but I'm not seeing it when I print out table_data. What am I doing wrong?
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from time import sleep
from random import randint
import re
towns = pd.DataFrame()
url = f"https://www.city-data.com/city/Adak-Alaska.html"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
dfs = pd.read_html(page)
for x in dfs:
print(x) ## do what you will with the data
For instance, the religions would be table 17 (dfs[17]):
Religion Adherents Congregations
0 Orthodox 754 6
1 Evangelical Protestant 232 3
2 Catholic 185 1
3 Other 112 1
4 Mainline Protestant 82 1
5 None 4196 -
EDIT: Given the OP's insurmountable issues with his python install, a workaround would be:
url = "https://www.city-data.com/city/Adak-Alaska.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")
for x in soup.select('table'):
for z in x.select('tr'):
print([y.text.strip() for y in z.find_all(['td', 'th'])])
print('________________')
Results can be further transformed in dataframes.

How i can get random object from list in Python

I have build a list which contains href from website and i wanna randomly select one of this link, how can i do that?
from bs4 import BeautifulSoup
import urllib
import requests
import re
import random
url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone,'lxml')
def getItems():
for a in soup.findAll('a',attrs={'href': re.compile("/en/latest/article.")}):
articles = a['href']
x = random.choice(articles)
print(x)
That code work, but selecting only random index from all of the objects
You're very close to the answer. You just need to do this:
from bs4 import BeautifulSoup
import urllib
import requests
import re
import random
url = "https://www.formula1.com/en/latest.html"
articles = []
respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone,'lxml')
def getItems():
for a in soup.findAll('a',attrs={'href': re.compile("/en/latest/article.")}):
articles.append(a['href'])
x = random.choice(articles)
print(x)
getItems()
The changes are:
We add each article to the articles array.
The random choice is now done after the loop, rather than inside the loop.

if string contains from list

I want to check if any of the excluded sites show up. I can get it to work with just one site, but as soon as I make it a list, it errors at if donts in thingy:
TypeError: 'in ' requires string as left operand, not tuple"
This is my code:
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re
url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.select('a[href*="http"]'):
thingy = (link.get('href'))
thingy = str(thingy)
if donts in thingy:
pass
else:
print (thingy)
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re
url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.select('a[href*="http"]'):
thingy = (link.get('href'))
thingy = str(thingy)
if thingy in donts :
print (thingy)
else:
pass
Judge: string in tuple
The crux of your problem is how you're searching your excluded list:
excluded = ("a", "b", "c")
links = ["a", "d", "e"]
for site in links:
if site not in excluded: # We want to know if the site is in the excluded list
print(f"Site not excluded: {site}")
Reverse the order of your elements and this should work fine. I've inverted your logic here so you can skip the unnecessary pass.
As a side note, this is one reason clear variable names can help - they will help you reason about what the logic should be doing. Especially in Python where ergonomics like in exist, this is very useful.
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re
url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')
r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.select('a[href*="http"]'):
thingy = (link.get('href'))
thingy = str(thingy)
if any(d in thingy for d in donts):
pass
else:
print (thingy)

scrape blocks of data from webpage API

I try to collect block data which forms a small table from a webpage. Pls see my codes below.
`
import requests
import re
import json
import sys
import os
import time
from lxml import html,etree
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.investing.com/instruments/OptionsDataAjax'
params = {'pair_id': 525, ## SPX
'date': 1536555600, ## 2018-9-4
'strike': 'all', ## all prices
'callspots': 'calls',#'call_andputs',
'type':'analysis', # webpage viewer
'bringData':'true',
}
headers = {'User-Agent': Chrome/39.0.2171.95 Safari/537.36'}
def R(text, end='\n'): print('\033[0;31m{}\033[0m'.format(text), end=end)
def G(text, end='\n'): print('\033[0;32m{}\033[0m'.format(text), end=end)
page = requests.get(url, params=params,headers = headers)
if page.status_code != 200:
R('ERROR CODE:{}'.format(page.status_code))
sys.exit
G('Problem in connection!')
else:
G('OK')
soup = BeautifulSoup(page.content,'lxml')
spdata = json.loads(soup.text)
print(spdata['data'])`
This result--spdata['data'] gives me a str, I just want to get following blocks in this str. There are many such data blocks in this str with the same format.
SymbolSPY180910C00250000
Delta0.9656
Imp Vol0.2431
Bid33.26
Gamma0.0039
Theoretical33.06
Ask33.41
Theta-0.0381
Intrinsic Value33.13
Volume0
Vega0.0617
Time Value-33.13
Open Interest0
Rho0.1969
Delta / Theta-25.3172
I use json and BeautifulSoup here, maybe regular expression will help but I don't know much about re. To get the result, any approach is appreciated. Thanks.
Add this after your code:
regex = r"((SymbolSPY[1-9]*):?\s*)(.*?)\n[^\S\n]*\n[^\S\n]*"
for match in re.finditer(regex, spdata['data'], re.MULTILINE | re.DOTALL):
for line in match.group().splitlines():
print (line.strip())
Outputs
OK
SymbolSPY180910C00245000
Delta0.9682
Imp Vol0.2779
Bid38.26
Gamma0.0032
Theoretical38.05
Ask38.42
Theta-0.0397
Intrinsic Value38.13
Volume0
Vega0.0579
Time Value-38.13
Open Interest0
Rho0.1934
Delta / Theta-24.3966
SymbolSPY180910P00245000
Delta-0.0262
Imp Vol0.2652
...

Categories

Resources