Web Scraping with BeautifulSoup: Node Selection and Traversal Techniques
BeautifulSoup is a Python library for parsing HTML and XML documents, enabling efficient data extraction from web pages.
Installation
# Install BeautifulSoup
pip install beautifulsoup4
# Install lxml parser
pip install lxml
Basic Node Selection
Initializing BeautifulSoup
from bs4 import BeautifulSoup

# Sample HTML document used by the examples below.
# NOTE(review): the closing </body> and </html> tags are missing here;
# lxml normally repairs such incomplete markup — confirm the outputs
# shown later still match the repaired tree.
html_content = """
<html><head><title>Sample Page</title></head>
<body>
<p class="header" name="main"><b>Welcome</b></p>
<p class="content">This is a paragraph with links:
<a href="http://site.com/first" class="link" id="ref1"><!-- First --></a>,
<a href="http://site.com/second" class="link" id="ref2">Second</a> and
<a href="http://site.com/third" class="link" id="ref3">Third</a>.
</p>
<p class="footer">...</p>
"""
# Build the parse tree using the lxml parser installed above.
soup = BeautifulSoup(html_content, 'lxml')
Accessing Node Elements
# Retrieve the title node
# Attribute-style access (soup.title) returns the first matching element,
# which is a Tag object (shown by the type() call below).
print(soup.title)
# Output: <title>Sample Page</title>
print(type(soup.title))
# Output: <class 'bs4.element.Tag'>
# Get text content of title
# .string yields just the text inside the tag, without the markup.
print(soup.title.string)
# Output: Sample Page
# Get node name
# .name is the tag's own name ('title' here).
print(soup.title.name)
# Output: title
Working with Attributes
# Access all attributes of a paragraph
# .attrs returns a dict of every attribute on the first <p> tag.
print(soup.p.attrs)
# Output: {'class': ['header'], 'name': 'main'}
# Retrieve a specific attribute
print(soup.p.attrs['name'])
# Output: main
# Subscripting the Tag directly is shorthand for the same lookup.
print(soup.p['name'])
# Output: main
# Note: class returns a list
# (an element may carry several space-separated class names)
print(soup.p['class'])
# Output: ['header']
Nested Selection
# Attribute-style lookups can be chained: each step returns a Tag that
# supports the same access, so head.title drills down one level at a time.
nested_markup = """
<html><head><title>Nested Example</title></head>
<body>
"""
nested_soup = BeautifulSoup(nested_markup, 'lxml')
print(nested_soup.head.title)
# Output: <title>Nested Example</title>
Advanced Node Traversal
Parent and Ancestor Nodes
# Walking upward through the tree: .parent gives the immediate enclosing
# tag, while .parents iterates every ancestor toward the document root.
parent_markup = """
<html>
<body>
<p class="section">
<a href="http://example.com/item" class="item-link" id="item1">
<span>Item Text</span>
</a>
</p>
</body>
</html>
"""
parent_soup = BeautifulSoup(parent_markup, 'lxml')
# Immediate parent of the first <a> tag
print(parent_soup.a.parent)
# Output: <p class="section">...</p>
# Every ancestor, innermost first
ancestors = list(parent_soup.a.parents)
print(ancestors)
# Output: [<p class="section">...</p>, <body>...</body>, <html>...</html>]
Child and Descendant Nodes
# Walking downward: .contents is a list of direct children (bare text
# nodes included), .children iterates those same items lazily, and
# .descendants walks every nested node in document order.
children_markup = """
<html>
<body>
<p class="list">
Text before link
<a href="http://example.com/alpha" class="entry" id="entry1">
<span>Alpha</span>
</a>
<a href="http://example.com/beta" class="entry" id="entry2">Beta</a>
and
<a href="http://example.com/gamma" class="entry" id="entry3">Gamma</a>
Text after links.
</p>
</body>
</html>
"""
children_soup = BeautifulSoup(children_markup, 'lxml')
# Direct children as a list
print(children_soup.p.contents)
# Output includes text and <a> elements
# Lazy iteration over the same direct children
for i, node in enumerate(children_soup.p.children):
    print(i, node)
# Recursive iteration over every nested node
for i, node in enumerate(children_soup.p.descendants):
    print(i, node)
Sibling Nodes
# Siblings are nodes sharing the same parent. Text between tags is itself
# a node, so it shows up among the siblings of the <a> elements.
sibling_markup = """
<html>
<body>
<p class="paragraph">
Initial text
<a href="http://example.com/one" class="sibling" id="sib1">
<span>One</span>
</a>
Middle text
<a href="http://example.com/two" class="sibling" id="sib2">Two</a>
and
<a href="http://example.com/three" class="sibling" id="sib3">Three</a>
Final text.
</p>
</body>
</html>
"""
sibling_soup = BeautifulSoup(sibling_markup, 'lxml')
first_link = sibling_soup.a
print('Next sibling:', first_link.next_sibling)
print('Previous sibling:', first_link.previous_sibling)
print('All next siblings:', list(first_link.next_siblings))
print('All previous siblings:', list(first_link.previous_siblings))
Method-Based Selection
find_all() Method
# find_all() collects every matching node. Matches can be constrained by
# tag name, by arbitrary attributes, or by CSS class.
find_markup = '''
<div class="container">
<div class="header">
<h4>Title</h4>
</div>
<div class="body">
<ul class="items" id="group1">
<li class="item">Apple</li>
<li class="item">Banana</li>
<li class="item">Cherry</li>
</ul>
<ul class="items secondary" id="group2">
<li class="item">Date</li>
<li class="item">Elderberry</li>
</ul>
</div>
</div>
'''
soup_find = BeautifulSoup(find_markup, 'lxml')
# Find all ul elements (the tag name may be passed positionally)
print(soup_find.find_all('ul'))
# Nested search: any Tag supports find_all() on its own subtree
for group in soup_find.find_all('ul'):
    for entry in group.find_all('li'):
        print(entry.string)
# Search by attributes: an attrs dict, the keyword shortcut, and class_
# (the trailing underscore avoids Python's reserved word) all work
print(soup_find.find_all(attrs={'id': 'group1'}))
print(soup_find.find_all(id='group1'))
print(soup_find.find_all(class_='item'))
# Text search with regex
import re

html_text = '''
<div class="box">
<div class="content">
<a>This contains a link</a>
<a>Another link here</a>
</div>
</div>
'''
soup_text = BeautifulSoup(html_text, 'lxml')
# Match element text against a compiled pattern; returns the matching
# text nodes rather than the tags that contain them.
# The `text` argument was renamed to `string` in Beautiful Soup 4.4.0;
# `text=` still works but is deprecated and warns in current releases.
print(soup_text.find_all(string=re.compile('link')))
# Output: ['This contains a link', 'Another link here']
find() Method
# find() takes the same arguments as find_all() but returns only the
# FIRST match (or None when nothing matches).
print(soup_find.find(name='ul'))
print(soup_find.find(class_='items'))
Additional Query Methods
find_parents() and find_parent(): return ancestor nodes
find_next_siblings() and find_next_sibling(): return following sibling nodes
find_previous_siblings() and find_previous_sibling(): return preceding sibling nodes
find_all_next() and find_next(): return nodes after the current node
find_all_previous() and find_previous(): return nodes before the current node
CSS Selectors
# select() accepts standard CSS selectors: class (.x), id (#x),
# and descendant combinations.
css_markup = '''
<div class="container">
<div class="header">
<h4>Heading</h4>
</div>
<div class="body">
<ul class="items" id="set1">
<li class="element">First</li>
<li class="element">Second</li>
<li class="element">Third</li>
</ul>
<ul class="items minor" id="set2">
<li class="element">Fourth</li>
<li class="element">Fifth</li>
</ul>
</div>
</div>
'''
css_soup = BeautifulSoup(css_markup, 'lxml')
# Basic CSS selection
print(css_soup.select('.container .header'))
print(css_soup.select('ul li'))
print(css_soup.select('#set2 .element'))
# select() can be chained on any Tag for nested selection
for lst in css_soup.select('ul'):
    print(lst.select('li'))
# Attribute access: subscription and the attrs dict are equivalent
for lst in css_soup.select('ul'):
    print(lst['id'])
    print(lst.attrs['id'])
# Retrieving text with get_text() and with .string
for item in css_soup.select('li'):
    print('Text via get_text():', item.get_text())
    print('Text via string:', item.string)