Exploring Practical Python Libraries for Diverse Applications
BaiduSpider for Image Scraping
BaiduSpider is a library for scraping Baidu search results, supporting various search types including images. Below is a code snippet that downloads images based on a keyword.
from baiduspider import BaiduSpider
import requests
# Search settings: number of result pages to process, images to take from
# each page, and the keyword to search for.
pages_to_scrape = 5
images_per_page = 10
search_term = 'cat'

# One spider instance is reused for every page request.
spider = BaiduSpider()

for page_index in range(1, pages_to_scrape + 1):
    # Query each result page once. Every image of the page is read from this
    # single response instead of repeating the same search per image.
    try:
        search_result = spider.search_pic(search_term, pn=page_index)
    except Exception:
        # Best-effort scraping: a failed page is reported and skipped.
        print('Download error encountered')
        continue

    for img_index in range(images_per_page):
        try:
            image_url = search_result['results'][img_index]['url']
            # A timeout keeps one stalled server from hanging the whole run.
            response = requests.get(image_url, timeout=30)
            # Do not save an HTTP error page under a .jpg name.
            response.raise_for_status()
            filename = f'./downloaded_images/{search_term}_{page_index}_{img_index}.jpg'
            with open(filename, 'wb') as file:
                file.write(response.content)
            print(f'Page {page_index}, Image {img_index} downloaded')
        except Exception:
            # Best-effort: a missing result, network failure or write error
            # only skips this one image.
            print('Download error encountered')
            continue
pages_to_scrape: Number of search result pages to process. images_per_page: Images to retrieve per page. search_term: Keyword for image search.
Install required libraries: pip install baiduspider requests. Ensure a downloaded_images directory exists in the script's location.
Image Deduplication Using MD5 Hashing
To remove duplicate images, compute MD5 hashes for files in a directory.
import os
import hashlib
directory_path = 'image_collection/'


def calculate_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is opened in binary mode and fed to the hash in 4096-byte
    pieces, so it is never read into memory as a whole.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read marks the end of the file.
                break
            digest.update(block)
    return digest.hexdigest()
def remove_duplicates(directory=None):
    """Delete files whose content duplicates a file seen earlier in the walk.

    Every file below the directory tree is hashed with ``calculate_md5``.
    The first file encountered for a given hash is kept; any later file with
    the same hash is removed from disk.

    Args:
        directory: Root of the tree to scan. When omitted, the module-level
            ``directory_path`` is used, so existing no-argument calls behave
            as before.
    """
    if directory is None:
        directory = directory_path

    # Only membership is ever tested, so a set of hashes is sufficient.
    seen_hashes = set()
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            full_path = os.path.join(root, filename)
            file_hash = calculate_md5(full_path)
            if file_hash in seen_hashes:
                os.remove(full_path)
            else:
                seen_hashes.add(file_hash)


remove_duplicates()
Place images in image_collection/. The script deletes duplicates based on identical MD5 hashes.
QR Code Generation and Recognition
Generating QR Codes with MyQR
from MyQR import myqr
import os
# Keyword arguments for the QR code: the encoded text, the picture to embed,
# whether to keep colors, and where to save the result.
qr_options = {
    'words': 'https://www.example.com',
    'picture': 'logo.png',
    'colorized': True,
    'save_name': 'qrcode_output.png',
    'save_dir': os.getcwd(),
}
myqr.run(**qr_options)
Install: pip install MyQR. This creates a QR code embedding a URL and optional image.
Reading QR Codes with Pyzbar
from pyzbar.pyzbar import decode
import cv2
image_path = 'qrcode_output.png'
img = cv2.imread(image_path)
# cv2.imread does not raise when the file is missing or unreadable; it
# returns None. Fail here with a clear message instead of inside decode().
if img is None:
    raise FileNotFoundError(f'Could not read image: {image_path}')

decoded_objects = decode(img)
for obj in decoded_objects:
    # Outline each detected code in red (BGR) with a 2-pixel border.
    bbox = obj.rect
    cv2.rectangle(img, (bbox.left, bbox.top), (bbox.left + bbox.width, bbox.top + bbox.height), (0, 0, 255), 2)
    data = obj.data.decode('utf-8')
    print(f'Decoded data: {data}')

# Show the annotated image until a key is pressed.
cv2.imshow('QR Code', img)
cv2.waitKey(0)
cv2.destroyAllWindows()
Install: pip install pyzbar opencv-python. This decodes QR codes and displays bounding boxes.
Opening URLs in Default Browser
Use Python's built-in webbrowser module.
import webbrowser
# Address handed to the webbrowser module for opening.
target_url = 'http://www.example.com'
webbrowser.open(target_url)
This opens the URL in the system's default web browser.
Simulating Mouse Actions with Pynput
Create a script for automated mouse clicks using keyboard triggers.
import win32api
from pynput.mouse import Button, Controller
import time

mouse_controller = Controller()

# Pause between polls so the loop does not spin a CPU core at 100% and the
# click rate while the key is held stays bounded.
POLL_INTERVAL_SECONDS = 0.01

while True:
    # The 0x8000 bit of GetAsyncKeyState is tested for each key.
    if win32api.GetAsyncKeyState(0x21) & 0x8000:  # PgUp key
        mouse_controller.click(Button.left)
    if win32api.GetAsyncKeyState(0x22) & 0x8000:  # PgDn key
        break
    time.sleep(POLL_INTERVAL_SECONDS)
Install: pip install pynput pywin32 (win32api is provided by pywin32 and is available on Windows only). This triggers left-clicks when PgUp is pressed and stops on PgDn.
Excel Data Manipulation with Xlwings
Filter and mark rows in an Excel file based on conditions.
import xlwings as xw
file_path = 'data.xlsx'

# Start a hidden Excel instance without creating an empty workbook.
app = xw.App(visible=False, add_book=False)
try:
    app.display_alerts = False
    app.screen_updating = False
    workbook = app.books.open(file_path)
    try:
        worksheet = workbook.sheets.active
        for row in range(2, 8):  # Assuming data starts at row 2
            diameter = int(worksheet.range(f'A{row}').value)
            material = worksheet.range(f'B{row}').value
            age = int(worksheet.range(f'C{row}').value)
            # Mark rows that satisfy all three conditions.
            if diameter >= 30 and material == 'metal' and age >= 18:
                worksheet.range(f'D{row}').value = 'GC2'
            print(f'Row {row} processed')
        workbook.save()
    finally:
        # Close the workbook even if a cell could not be converted or written.
        workbook.close()
finally:
    # Always shut the hidden Excel instance down; otherwise an error above
    # would leave an invisible Excel process running.
    app.quit()
Install: pip install xlwings. This updates Excel cells meeting specific criteria.
Generating Combinations with Itertools
Permutations with Repetition
Generate all possible sequences of length 3 from a set, allowing repeats, where order matters.
import itertools
# Source alphabet for the generated sequences.
characters = list('abcde')
# Every ordered triple over the alphabet, with repeated elements allowed.
permutations = [*itertools.product(characters, repeat=3)]
print(f'Total permutations: {len(permutations)}')
Combinations without Repetition
Select unique subsets of size 3 where order does not matter.
# Every 3-element selection from `characters`, ignoring order.
combinations = [*itertools.combinations(characters, 3)]
print(f'Total combinations: {len(combinations)}')
Extracting 7z Archives with Py7zr
Decompress password-protected 7z files.
import py7zr
# Archive to open and the password protecting it.
archive_name = 'archive.7z'
archive_password = 'secret'

# extractall() is called without a target path.
with py7zr.SevenZipFile(archive_name, mode='r', password=archive_password) as archive:
    archive.extractall()
Install: pip install py7zr. This extracts contents to the current directory.
Text Comparison with Difflib
Compare two text files and generate an HTML diff report.
import difflib
import webbrowser
from pathlib import Path

correct_file = 'reference.txt'
modified_file = 'modified.txt'

# Read both inputs as lists of lines, which is what HtmlDiff compares.
with open(correct_file, encoding='utf-8') as f:
    correct_lines = f.readlines()
with open(modified_file, encoding='utf-8') as f:
    modified_lines = f.readlines()

# Build a complete HTML page with a side-by-side table; long lines wrap at
# 50 columns.
diff = difflib.HtmlDiff(wrapcolumn=50)
html_content = diff.make_file(correct_lines, modified_lines, fromdesc='Original', todesc='Modified')

# Resolve the report location to an absolute path so it can be opened as a
# file:// URI; passing a bare relative filename to webbrowser.open is not
# portable across platforms.
report_path = Path('diff_report.html').resolve()
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(html_content)
webbrowser.open(report_path.as_uri())
This creates a visual comparison highlighting differences between files.