E-commerce User Behavior Analysis Using Spark SQL
With the rapid growth of e-commerce platforms, analyzing user purchasing behavior and preferences has become essential for business intelligence. This article demonstrates how to simulate user behavior data and perform analytics using Spark SQL.
Simulated Dataset Generation
The following Scala code generates synthetic user behavior data for analysis:
package com.analytics.ecommerce
import java.io.{FileWriter, BufferedWriter}
import java.text.SimpleDateFormat
import java.util.{Calendar, Date, Random}
object UserBehaviorGenerator {

  /** Generates a synthetic user-behavior log covering the past 20 days.
   *
   * Each output line has six comma-separated fields:
   * date, timestamp, userId, pageId, category, action.
   * Registration records carry the literal string "null" for
   * userId/pageId/category and the action "register".
   */
  def main(args: Array[String]): Unit = {
    val outputBuffer = new StringBuilder()
    val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
    val randomGenerator = new Random
    val productCategories = Array(
      "Electronics", "Clothing", "Books",
      "Home Appliances", "Food",
      "Sports", "Toys", "Beauty",
      "Furniture", "Digital Media"
    )
    val userActions = Array(
      "view", "purchase", "add_cart",
      "select", "wishlist", "unlike"
    )
    val calendar = Calendar.getInstance()
    calendar.setTime(new Date())
    // Generate data for the past 20 days, stepping one day back per iteration.
    for (_ <- 1 to 20) {
      calendar.add(Calendar.DAY_OF_YEAR, -1)
      val targetDate = calendar.getTime
      val dateString = dateFormat.format(targetDate)
      // Fix: derive the epoch timestamp from the simulated day instead of
      // `new Date().getTime` ("now"), so the timestamp column agrees with
      // the dateString column for records dated in the past.
      val dayMillis = targetDate.getTime
      // Generate 3000 user activity records per day.
      for (_ <- 0 until 3000) {
        val userId = randomGenerator.nextInt(100000)
        val pageId = randomGenerator.nextInt(10000)
        val category = productCategories(randomGenerator.nextInt(productCategories.length))
        val action = userActions(randomGenerator.nextInt(userActions.length))
        outputBuffer.append(s"$dateString,$dayMillis,$userId,$pageId,$category,$action\n")
      }
      // Generate 10-20 new user registration records per day.
      val registrationCount = randomGenerator.nextInt(11) + 10
      for (_ <- 0 until registrationCount) {
        outputBuffer.append(s"$dateString,$dayMillis,null,null,null,register\n")
      }
    }
    // Append to the local/HDFS-mounted log file; close the writer even if
    // the write fails.
    val writer = new BufferedWriter(new FileWriter("/path/to/output/user_behavior.log", true))
    try {
      writer.write(outputBuffer.toString)
    } finally {
      writer.close()
    }
  }
}
The dataset contains six fields: date, timestamp, userId, pageId, category, and action. New users have null values for userId, pageId, and category.
Spark SQL Table Creation
package com.analytics.ecommerce

// Fix: the original listing referenced SparkConf and DataFrame without
// importing them, so it would not compile.
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType}

object EcommerceAnalytics {

  /** Entry point: loads the user-behavior log into a cached DataFrame,
   * registers it as the temp view `user_activity`, and runs the
   * analysis queries.
   */
  def main(args: Array[String]): Unit = {
    val sparkConfig = new SparkConf()
      .setMaster("local[6]")
      .setAppName("ecommerce-analysis")
    val spark = SparkSession.builder()
      .config(sparkConfig)
      .getOrCreate()
    // Schema matches the six comma-separated fields emitted by the generator;
    // the literal string "null" in registration rows parses to NULL for the
    // Integer columns.
    val dataSchema = StructType(Seq(
      StructField("eventDate", StringType),
      StructField("timestamp", LongType),
      StructField("userId", IntegerType),
      StructField("pageId", IntegerType),
      StructField("category", StringType),
      StructField("action", StringType)
    ))
    val userBehaviorData = spark.read
      .schema(dataSchema)
      .option("delimiter", ",")
      .option("header", false)
      .csv("hdfs://namenode:9000/data/user_activity/*.log")
      .repartition(4)
      .cache()
    userBehaviorData.createTempView("user_activity")
    // Execute analysis queries.
    analyzeUserBehavior(spark, userBehaviorData)
    // Fix: release cluster resources once the queries finish.
    spark.stop()
  }

  /** Placeholder for the analysis queries shown later in the article. */
  def analyzeUserBehavior(spark: SparkSession, data: DataFrame): Unit = {
    // Query implementations below
  }
}
Data Analysis Queries
Finding Users with Most Category Views
// Count category views per user and show the 30 heaviest viewers.
// Fix: restrict to the "view" action — the original counted every action
// type (purchase, add_cart, ...), which contradicts the stated intent of
// ranking users by category VIEWS.
data
  .filter($"userId".isNotNull)
  .filter($"action" === "view")
  .groupBy($"userId")
  .agg(count($"category").as("category_count"))
  .orderBy($"category_count".desc)
  .show(30)
Active Users in the Last 30 Days
// Aggregate per-user activity over the trailing 30 days, then keep the
// users holding the top two dense ranks (ties share a rank, so more than
// two users may appear).
val recentActivity = data
  .filter($"userId".isNotNull)
  .where(datediff(current_timestamp(), $"eventDate") < 30)
  .groupBy($"userId")
  .agg(count($"userId").as("activity_count"))
val activityRank = dense_rank().over(Window.orderBy($"activity_count".desc))
recentActivity
  .select($"userId", $"activity_count", activityRank.as("rank"))
  .filter($"rank" < 3)
  .show(1000)
Most Popular Product Categories
// Tally activity per category for identified users, then keep the two
// highest popularity ranks (dense_rank, so tied categories share a rank).
val categoryTotals = data
  .filter($"userId".isNotNull)
  .groupBy($"category")
  .agg(count($"category").as("total_views"))
val popularity = dense_rank().over(Window.orderBy($"total_views".desc))
categoryTotals
  .select($"category", $"total_views", popularity.as("popularity_rank"))
  .filter($"popularity_rank" <= 2)
  .show(1000)
SQL Query Alternative
// The same lookup expressed against the registered temp view; the query
// result is bound to a val before displaying the first 10 rows.
val singleUserActivity = spark.sql("""
SELECT *
FROM user_activity
WHERE userId = 18617
""")
singleUserActivity.show(10)
Practice Exercises
- Calculate conversion rates: Identify the top 3 categories with the highest purchase-to-action ratio (purchases / total actions)
- Analyze purchase behavior: Find users with the highest and lowest purchase conversion rates
- Identify target customers: Determine high-value and low-value customers based on purchase frequency in the last quarter