Automating Blog Article Access and Statistics with Python
This tutorial demonstrates how to programmatically control web browsers using Python to automate opening blog articles, collect view statistics, and manage browser processes efficiently.
Basic Browser Automation
The following approach automatically launches your default browser and opens specific web pages in new tabs. Note that this implementation handles single-page listings; pagination support can be added as a future enhancement.
import webbrowser
import time
import random
import os
from bs4 import BeautifulSoup
import requests
def fetch_article_links(listing_url):
    """
    Simulate browser headers to retrieve page source and extract article URLs.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(listing_url, headers=headers)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return []

    soup = BeautifulSoup(response.text, 'lxml')
    article_urls = []
    base_domain = 'https://example-blog.com'

    # Extract article links (adjust selectors based on target site structure)
    for entry in soup.find_all('div', class_='list_item list_view'):
        link_tag = entry.find('a')
        if link_tag and link_tag.get('href'):
            relative_path = link_tag['href']
            full_url = relative_path if relative_path.startswith('http') else base_domain + relative_path
            article_urls.append(full_url)

    return article_urls
def sequential_browser_launch(url_collection):
    """
    Open each URL in a new browser tab with a delay to prevent system overload.
    """
    if not url_collection:
        return

    for page_url in url_collection:
        webbrowser.open_new_tab(page_url)
        time.sleep(1)  # Prevent excessive resource consumption

    # Cleanup after the complete iteration
    time.sleep(2)
    os.system('taskkill /f /im chrome.exe')
    print(f"Browser session completed. Processed {len(url_collection)} articles.")
# Usage example
target_page = 'https://example-blog.com?viewmode=contents'
links = fetch_article_links(target_page)
sequential_browser_launch(links)
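The listing above is fetched as a single page. As noted, pagination is a natural next step; the sketch below assumes, hypothetically, that further pages are reachable through a page query parameter appended to the listing URL. The real parameter name and stopping condition depend on the target platform, so inspect the site's pagination URLs before relying on this.

def fetch_all_article_links(listing_url, max_pages=10):
    """
    Pagination sketch: collect article links across several listing pages.
    The 'page' query parameter is a hypothetical convention; adjust it to
    match the target site's actual pagination scheme.
    """
    all_urls = []
    for page in range(1, max_pages + 1):
        page_url = f"{listing_url}&page={page}"  # hypothetical parameter
        page_links = fetch_article_links(page_url)
        if not page_links:
            break  # an empty page usually means we ran past the last one
        all_urls.extend(page_links)
        time.sleep(1)  # polite delay between listing requests
    return all_urls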
Article Statistics and Sorting
This enhanced version counts total articles, extracts view statistics, and sorts content by popularity (ascending order) before automated browsing.
import requests
import re
import time
import random
import os
from bs4 import BeautifulSoup
import webbrowser
def scrape_and_sort_by_popularity(blog_url):
    """
    Extract articles, sort by view count, and persist metadata.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        resp = requests.get(blog_url, headers=headers)
        resp.raise_for_status()
    except requests.RequestException as err:
        print(f"Request failed: {err}")
        return []

    parsed_html = BeautifulSoup(resp.text, 'lxml')
    entries = parsed_html.find_all('div', class_='article-item-box')
    print(f"Total articles discovered: {len(entries)}")

    article_metadata = []
    for item in entries:
        try:
            title_element = item.find('h4').find('a')
            article_url = title_element.get('href')
            title_text = title_element.text.strip()

            # Extract view count from statistics container
            stats_div = item.find('div', class_='info-box')
            view_element = stats_div.find('span', class_='read-num')
            view_string = view_element.text.strip()
            view_count = int(re.sub(r"\D", "", view_string))

            article_metadata.append({
                'views': view_count,
                'title': title_text,
                'url': article_url,
                'view_display': view_string
            })
        except (AttributeError, ValueError):
            continue

    # Sort by view count ascending (least popular first)
    sorted_articles = sorted(article_metadata, key=lambda x: x['views'])

    # Persist to file and extract URLs
    output_file = 'article_rankings.txt'
    urls = []
    with open(output_file, 'w', encoding='utf-8') as f:
        for article in sorted_articles:
            line = f"{article['title']}\t{article['view_display']}\t{article['url']}\n"
            f.write(line)
            urls.append(article['url'])
            print(line.strip())

    return urls
# Execute data collection
sorted_urls = scrape_and_sort_by_popularity('https://example-blog.com?viewmode=contents')
Batch Processing with Process Management
The following implementation adds batch cycling with random iteration counts and explicit browser process termination to manage memory usage.
def cyclic_browsing_session(url_list, max_cycles=3, batch_size=5):
    """
    Execute automated browsing with random cycle counts and process cleanup.
    """
    if not url_list:
        return

    cycle_count = random.randint(1, max_cycles)
    print(f"Initiating browsing session: {cycle_count} cycles planned")

    for cycle in range(cycle_count):
        # Open the first batch of articles (the same subset on every cycle)
        for index in range(min(batch_size, len(url_list))):
            webbrowser.open_new_tab(url_list[index])
            time.sleep(1)

        # Terminate browser process to prevent memory accumulation
        os.system('taskkill /f /im chrome.exe')
        print(f"Completed cycle {cycle + 1}/{cycle_count}")
# Run the automated session
cyclic_browsing_session(sorted_urls, max_cycles=10)
Python 3 Implementation Notes
For modern Python 3 environments, use the requests library instead of urllib2 for improved error handling and readability. Key differences include:
- Use requests.get() instead of urllib2.urlopen().
- Exception handling uses the as keyword (except RequestException as err) instead of the old comma syntax.
- File operations should specify encoding='utf-8'.
- Sorting uses sorted() with a key function such as a lambda, rather than Python 2's cmp argument (functools.cmp_to_key exists only to adapt legacy comparators).
The rewritten examples above are fully compatible with Python 3.8+ and use contemporary libraries and syntax patterns.
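For a compact reference, here is a minimal sketch contrasting the two idioms; the Python 2 lines are shown only as comments, since they no longer parse under Python 3:

# Python 2 (legacy):
#     import urllib2
#     response = urllib2.urlopen(listing_url)
#     except urllib2.URLError, err:  # old comma syntax
# Python 3 equivalents, as used throughout this tutorial:
import requests

try:
    response = requests.get('https://example-blog.com?viewmode=contents')
    response.raise_for_status()
except requests.RequestException as err:  # 'as' keyword replaces the comma
    print(f"Request failed: {err}")

# Sorting with a key function instead of Python 2's cmp argument
articles = [{'views': 12}, {'views': 3}]
articles.sort(key=lambda a: a['views'])

# File I/O with an explicit encoding
with open('demo_output.txt', 'w', encoding='utf-8') as f:
    f.write('title\tviews\turl\n')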
Important Considerations
- Process Termination: The taskkill command is Windows-specific. For cross-platform compatibility, consider using psutil or OS-appropriate commands (see the sketch after this list).
- Rate Limiting: Always include delays (time.sleep()) between requests to avoid overwhelming target servers.
- Selector Adaptation: CSS class selectors (e.g., article-item-box, read-num) vary by blogging platform and may require inspection of the specific target HTML structure.
- Ethical Usage: Ensure automated browsing complies with the target website's Terms of Service and robots.txt policies.
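To illustrate the first point, here is a minimal cross-platform sketch built on the third-party psutil package (installed with pip install psutil); the process names to match are assumptions and vary by browser and operating system.

import psutil

def terminate_browser(process_names=('chrome.exe', 'chrome', 'firefox')):
    """
    Cross-platform alternative to the Windows-only taskkill call.
    Matches running processes by name and asks each one to exit.
    Adjust process_names for your browser and OS (assumed values).
    """
    for proc in psutil.process_iter(['name']):
        if proc.info['name'] in process_names:
            try:
                proc.terminate()  # SIGTERM on POSIX, TerminateProcess on Windows
            except psutil.NoSuchProcess:
                pass  # the process exited before we reached it

A call to terminate_browser() could stand in for the os.system('taskkill ...') lines in the examples above when the script needs to run outside Windows.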