Data Masking and Redaction
Introduction
Data masking protects sensitive information by replacing it with realistic but fictional data. Unlike encrypted data, masked data is permanently de-identified — there is no key that reverses it to recover the original value. Organizations use masking for development, testing, analytics, and compliance with privacy regulations like GDPR and CCPA.
Static Data Masking
Static masking creates a sanitized copy of a production database for non-production use.
import hashlib
import logging
import random
import secrets
import string
class StaticDataMasker:
    """Generate masked replacements for sensitive values.

    Random choices (phone digits, fake names) come from a private
    ``random.Random`` seeded with ``seed``, so two maskers built with the
    same seed and fed the same sequence of calls produce the same output.
    """

    # Pools the fake names are drawn from.
    _FIRST_NAMES = ('John', 'Jane', 'Alex', 'Sarah', 'Michael')
    _LAST_NAMES = ('Smith', 'Johnson', 'Williams', 'Brown', 'Jones')

    def __init__(self, seed=42):
        self.seed = seed
        self.rng = random.Random(seed)

    def mask_email(self, email):
        """Generate a consistent fake email from the real one.

        The fake local part is the first 12 hex characters of the SHA-256
        digest of the whole address, so the same input always maps to the
        same output regardless of the masker's seed or call history.

        Raises:
            ValueError: if ``email`` contains no ``'@'``.
        """
        # Only the presence of '@' is validated. Addresses with several '@'
        # (e.g. a quoted local part) are accepted because the digest is
        # computed over the whole string and never needs the split parts.
        if '@' not in email:
            raise ValueError(f"Not an email address (missing '@'): {email!r}")
        digest = hashlib.sha256(email.encode()).hexdigest()
        return f"{digest[:12]}@masked-domain.com"

    def mask_phone(self, phone):
        """Mask phone number, keeping format but replacing digits.

        Every digit is replaced by a digit drawn from the seeded generator;
        all other characters are copied through unchanged. Each call advances
        the generator, so repeated calls with the same number differ.
        """
        return ''.join(
            str(self.rng.randint(0, 9)) if char.isdigit() else char
            for char in phone
        )

    def mask_credit_card(self, cc_number):
        """Mask all but last 4 digits.

        Spaces and hyphens are removed first. Inputs with four or fewer
        remaining characters are returned as-is (nothing is left to mask).
        """
        clean = cc_number.replace(' ', '').replace('-', '')
        hidden = max(len(clean) - 4, 0)
        return '*' * hidden + clean[-4:]

    def mask_name(self, name):
        """Replace name with a fake name.

        A name with two or more whitespace-separated parts becomes
        ``"<first> <last>"``; anything else becomes a single first name.
        """
        if len(name.split()) >= 2:
            first = self.rng.choice(self._FIRST_NAMES)
            last = self.rng.choice(self._LAST_NAMES)
            return f"{first} {last}"
        return self.rng.choice(self._FIRST_NAMES)
SQL-Based Static Masking
-- PostgreSQL static masking
-- Builds users_masked as a one-off sanitized copy of users_production
-- (one output row per source row; id is carried over unchanged).
CREATE TABLE users_masked AS
SELECT
id,
-- Email becomes the MD5 hex digest of the original address plus a fixed fake domain,
-- so equal source emails map to equal masked emails.
md5(email) || '@masked.com' AS email,
-- Only the last four characters of the stored phone value are kept.
'***-***-' || RIGHT(phone, 4) AS phone,
-- Names containing a space are replaced by 'User <id>'; names without a space
-- are copied through as-is.
-- NOTE(review): single-word names therefore remain unmasked -- confirm this is intended.
CASE WHEN position(' ' IN full_name) > 0 THEN
'User ' || id::text
ELSE
full_name
END AS full_name,
-- SSN is replaced by the hex-encoded SHA-256 digest of its bytes.
-- NOTE(review): the digest is unsalted; presumably the SSN value space is small
-- enough to enumerate, so consider a keyed hash -- confirm threat model.
encode(sha256(ssn::bytea), 'hex') AS ssn
FROM users_production;
-- Consistent masking across tables
-- Overwrites the sensitive columns of customers in place. There is no WHERE
-- clause, so every row is updated and the original values are not kept.
UPDATE customers SET
-- Email and phone are derived from id only, so the same id always yields the same masked values.
email = 'customer_' || id || '@example.com',
-- The phone suffix is id modulo 10000, zero-padded to four digits; ids that differ
-- by a multiple of 10000 receive the same masked phone.
phone = CONCAT('555-', LPAD((id % 10000)::text, 4, '0')),
-- Keeps only the last four characters of the stored card value.
-- NOTE(review): CONCAT ignores NULL arguments in PostgreSQL, so a NULL credit_card
-- would become 'XXXX-XXXX-XXXX-' -- confirm that is acceptable.
credit_card = CONCAT('XXXX-XXXX-XXXX-', RIGHT(credit_card, 4));
Dynamic Data Masking
Dynamic masking applies real-time transformations to query results without modifying the underlying data.
PostgreSQL Dynamic Masking
-- Create a masked view
-- users_redacted exposes the rows of users with per-column masking chosen by
-- the role running the query (current_user):
--   email     : clear for 'admin', otherwise the constant '***@***.com'
--   phone     : clear for 'admin', otherwise all but the last four digits starred
--   full_name : clear for 'admin' and 'support', otherwise first character + '***'
CREATE VIEW users_redacted AS
SELECT
    id,
    CASE
        WHEN current_user = 'admin' THEN email
        ELSE '***@***.com'
    END AS email,
    CASE
        WHEN current_user = 'admin' THEN phone
        -- Star every digit that still has at least four digits somewhere after it.
        -- The lookahead skips non-digits (\D*) between digits, so formatted values
        -- such as '555-123-4567' are masked too; a lookahead of \d{4} alone only
        -- matches when four digits are adjacent and leaves such values untouched.
        ELSE regexp_replace(phone, '\d(?=(\D*\d){4})', '*', 'g')
    END AS phone,
    CASE
        WHEN current_user IN ('admin', 'support') THEN full_name
        ELSE CONCAT(LEFT(full_name, 1), '***')
    END AS full_name
FROM users;
-- Grant access to masked view
-- Both roles receive read-only access to the view; neither statement grants
-- anything on the underlying users table.
-- NOTE(review): the view's full_name rule compares current_user to 'support',
-- not 'support_agent' -- confirm which role name is intended.
GRANT SELECT ON users_redacted TO app_user;
GRANT SELECT ON users_redacted TO support_agent;
Application-Level Dynamic Masking
from functools import wraps
def _mask_field_value(field, value):
    """Return the masked form of ``value`` for the response key ``field``."""
    if value is None:
        # Nothing to hide; keep "no value" distinguishable from a masked one.
        return None
    text = value if isinstance(value, str) else str(value)
    if field in ('email', 'email_address'):
        # rpartition splits on the LAST '@', so a quoted local part that
        # itself contains '@' does not break the unpacking.
        local, at_sign, domain = text.rpartition('@')
        if not at_sign:
            # Not shaped like an address: hide it entirely.
            return '***'
        return f"{local[:1]}***@{domain}"
    if field in ('phone', 'phone_number'):
        return f"***-***-{text[-4:]}"
    if field in ('ssn', 'social_security'):
        return f"***-**-{text[-4:]}"
    if 'card' in field.lower():
        return f"****-****-****-{text[-4:]}"
    # The caller asked for this field to be masked but no format-specific
    # rule applies: fail closed rather than return the value in clear text.
    return '***'


def mask_sensitive_fields(fields_to_mask):
    """Decorator to dynamically mask sensitive fields in API responses.

    Args:
        fields_to_mask: iterable of dict keys whose values must be masked.

    Returns:
        A decorator. The wrapped function is called unchanged; when its
        result is a ``dict``, each listed key present in it is overwritten
        in place with a masked value and the same dict is returned. Results
        of any other type are returned untouched.

    Masking rules, by key name:
        * ``email`` / ``email_address``: first character of the local part,
          ``***``, then the domain; ``***`` when the value has no ``@``.
        * ``phone`` / ``phone_number``: ``***-***-`` plus the last four characters.
        * ``ssn`` / ``social_security``: ``***-**-`` plus the last four characters.
        * any key containing ``card`` (case-insensitive):
          ``****-****-****-`` plus the last four characters.
        * any other listed key: ``***``.
    ``None`` values are left as ``None``; other non-string values are
    converted with ``str()`` before masking.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            if not isinstance(result, dict):
                return result
            for field in fields_to_mask:
                if field in result:
                    result[field] = _mask_field_value(field, result[field])
            return result
        return wrapper
    return decorator
@mask_sensitive_fields(['email', 'phone', 'credit_card'])
def get_user_profile(user_id):
    """Return a sample profile record for ``user_id``.

    The record is built with unmasked values; the decorator masks the
    ``email``, ``phone`` and ``credit_card`` entries before the caller
    receives it.
    """
    profile = dict(
        user_id=user_id,
        email='john.doe@example.com',
        phone='555-123-4567',
        credit_card='4111-1111-1111-1111',
    )
    return profile
Tokenization
Tokenization replaces sensitive data with non-sensitive placeholders (tokens) while storing the mapping in a secure vault.
class TokenizationService:
    """Swap sensitive values for opaque tokens backed by a vault.

    ``vault_client`` must provide ``store(key, record)`` and
    ``retrieve(key)``; records are kept under the key ``tokens/<token>``.
    """

    # Roles allowed to exchange a token for the original value.
    AUTHORIZED_ROLES = ('admin', 'auditor', 'compliance')

    def __init__(self, vault_client):
        self.vault = vault_client
        self.token_prefix = "tok_"

    def tokenize(self, sensitive_value, context):
        """Replace sensitive value with a token.

        Args:
            sensitive_value: the value to protect; stored in the vault record.
            context: caller-supplied metadata stored alongside the value.

        Returns:
            The new token: ``token_prefix`` followed by 32 random hex characters.
        """
        # 16 random bytes from the OS CSPRNG, so the token reveals nothing
        # about the value it stands for.
        token = f"{self.token_prefix}{secrets.token_hex(16)}"
        self.vault.store(
            f"tokens/{token}",
            {
                'value': sensitive_value,
                'context': context,
                'created_at': datetime.utcnow().isoformat(),
                'access_count': 0,
            },
        )
        return token

    def detokenize(self, token, requester_role):
        """Retrieve original value from token (if authorized).

        Each successful call increments ``access_count``, stamps
        ``last_accessed`` on the vault record, and writes an audit entry.

        Raises:
            ValueError: if ``token`` does not start with ``token_prefix``.
            PermissionError: if ``requester_role`` is not an authorized role.
            KeyError: if the vault returns ``None`` for the token.
        """
        if not token.startswith(self.token_prefix):
            raise ValueError("Invalid token format")
        if requester_role not in self.AUTHORIZED_ROLES:
            raise PermissionError("Not authorized to detokenize")

        key = f"tokens/{token}"
        record = self.vault.retrieve(key)
        if record is None:
            # Report an unknown token explicitly instead of failing with an
            # opaque TypeError on the counter update below.
            raise KeyError(f"Unknown token: {token}")

        # Tolerate records written without a counter.
        record['access_count'] = record.get('access_count', 0) + 1
        record['last_accessed'] = datetime.utcnow().isoformat()
        self.vault.store(key, record)

        self._audit_log('detokenize', token, requester_role)
        return record['value']

    def _audit_log(self, action, token, requester_role):
        """Record who performed ``action`` on ``token``.

        Default implementation writes to the module logger; only the token
        is logged, never the protected value. Subclasses may override this
        to send entries elsewhere.
        """
        logging.getLogger(__name__).info(
            "tokenization audit: action=%s token=%s role=%s",
            action, token, requester_role,
        )
GDPR Compliance
from datetime import datetime, timedelta
class GDPRDataProcessor:
    """Helpers for GDPR data-portability exports and erasure requests."""

    # How long each category of record is kept.
    RETENTION_PERIODS = {
        'user_profile': timedelta(days=365),
        'transaction_log': timedelta(days=730),
        'session_log': timedelta(days=90),
        'analytics': timedelta(days=180),
    }

    def mask_for_export(self, user_data):
        """GDPR Article 20: data portability with masking.

        Args:
            user_data: dict with ``email`` and ``username`` and, optionally,
                ``transactions`` (items with ``date``, ``amount``,
                ``reference``) and ``communications`` (items with ``date``,
                ``type`` and optional ``content``).

        Returns:
            A new dict with ``basic_info``, ``transactions`` and
            ``communications``. Transaction references are replaced by
            ``REF-`` plus the first 8 hex characters of their SHA-256 digest;
            communication content is cut to its first 100 characters, or
            ``None`` when missing or empty.

        Raises:
            KeyError: if a required key is absent.
        """
        transactions = [
            {
                'date': t['date'],
                'amount': t['amount'],
                'reference': f"REF-{hashlib.sha256(t['reference'].encode()).hexdigest()[:8]}",
            }
            for t in user_data.get('transactions', [])
        ]
        communications = [
            {
                'date': c['date'],
                'type': c['type'],
                'content_snippet': c['content'][:100] if c.get('content') else None,
            }
            for c in user_data.get('communications', [])
        ]
        return {
            'basic_info': {
                'email': user_data['email'],
                'username': user_data['username'],
            },
            'transactions': transactions,
            'communications': communications,
        }

    def delete_user_data(self, user_id, databases):
        """GDPR Article 17: right to erasure.

        Calls ``delete_user(user_id)`` on every entry of ``databases`` and
        records the outcome. A failure in one store does not stop the
        remaining stores from being processed.

        Returns:
            One log entry per database, in input order, each with
            ``database``, ``status`` (``'deleted'`` or ``'failed'``) and
            ``timestamp``; failed entries also carry ``error``.
        """
        deletion_log = []
        for db in databases:
            try:
                db.delete_user(user_id)
                deletion_log.append({
                    'database': db.name,
                    'status': 'deleted',
                    'timestamp': datetime.utcnow().isoformat(),
                })
            except Exception as e:
                # Deliberately broad: erasure must continue on the other
                # stores, and the failure is reported in the returned log.
                deletion_log.append({
                    'database': db.name,
                    'status': 'failed',
                    'error': str(e),
                    # Failed attempts are timestamped too, so the log shows
                    # when erasure was tried on every store.
                    'timestamp': datetime.utcnow().isoformat(),
                })
        return deletion_log
Conclusion
Data masking is essential for privacy compliance and reducing the risk of data exposure. Use static masking for non-production environments, dynamic masking for real-time access control, and tokenization for scenarios requiring reversible de-identification. Always log access to sensitive data, enforce role-based masking policies, and ensure masking is consistent across all data stores and applications.