Data Masking and Redaction


Introduction

Data masking protects sensitive information by replacing it with realistic but fictional data. Unlike encrypted data, which can be decrypted by anyone holding the key, masked data is meant to be permanently de-identified: the original value cannot be recovered from it. Organizations use masking for development, testing, analytics, and compliance with privacy regulations such as GDPR and CCPA.

Static Data Masking

Static masking creates a sanitized copy of a production database for non-production use.




import hashlib
import logging
import random
import secrets
import string
from datetime import datetime, timezone




class StaticDataMasker:
    """Produce masked stand-ins for sensitive values in a non-production copy.

    Email masking is derived from a hash of the input, so the same address
    always maps to the same fake address. Phone and name masking draw from a
    seeded random generator instead, so their output depends on the seed and
    on the order in which values are masked.
    """

    # Pools the fake names are drawn from.
    _FIRST_NAMES = ('John', 'Jane', 'Alex', 'Sarah', 'Michael')
    _LAST_NAMES = ('Smith', 'Johnson', 'Williams', 'Brown', 'Jones')

    def __init__(self, seed=42):
        """Create a masker whose random choices are reproducible for *seed*."""
        self.seed = seed
        self.rng = random.Random(seed)

    def mask_email(self, email: str) -> str:
        """Return a fake address derived deterministically from *email*.

        The whole input string is hashed, so malformed addresses (no ``@`` or
        several of them) are masked like any other value instead of aborting
        the run.

        NOTE(review): the hash is unsalted, so anyone who can guess an address
        can recompute its masked form.
        """
        digest = hashlib.sha256(email.encode()).hexdigest()
        return f"{digest[:12]}@masked-domain.com"

    def mask_phone(self, phone: str) -> str:
        """Replace every digit of *phone* with a random digit, keeping the rest.

        Non-digit characters (spaces, dashes, parentheses, ...) are copied
        through unchanged so the original formatting survives.
        """
        return ''.join(
            str(self.rng.randint(0, 9)) if char.isdigit() else char
            for char in phone
        )

    def mask_credit_card(self, cc_number: str) -> str:
        """Return *cc_number* with all but its last four characters starred.

        Spaces and dashes are removed first. Inputs of four characters or
        fewer are returned as-is (after separator removal).
        """
        digits = cc_number.replace(' ', '').replace('-', '')
        hidden = max(len(digits) - 4, 0)
        return '*' * hidden + digits[-4:]

    def mask_name(self, name: str) -> str:
        """Return a random fake name shaped like *name*.

        A name made of two or more whitespace-separated parts becomes
        "First Last"; anything else becomes a single first name.
        """
        first = self.rng.choice(self._FIRST_NAMES)
        if len(name.split()) >= 2:
            return f"{first} {self.rng.choice(self._LAST_NAMES)}"
        return first





SQL-Based Static Masking




-- PostgreSQL static masking: build a sanitized copy of users_production.
CREATE TABLE users_masked AS
SELECT
    id,
    -- Deterministic replacement: the same address always yields the same value.
    md5(email) || '@masked.com' AS email,
    -- Keep only the last four characters of the phone number.
    '***-***-' || RIGHT(phone, 4) AS phone,
    -- Replace every non-NULL name with a placeholder built from the row id.
    -- Single-word names are replaced too; copying them through would leave
    -- real names in the masked table.
    CASE
        WHEN full_name IS NULL THEN NULL
        ELSE 'User ' || id::text
    END AS full_name,
    -- NOTE(review): this is an unsalted hash; if the set of possible SSNs is
    -- small enough to enumerate, the original values can be recovered by
    -- brute force. Confirm this is acceptable or switch to a keyed hash.
    encode(sha256(ssn::bytea), 'hex') AS ssn
FROM users_production;




-- Consistent masking across tables: email and phone are rebuilt from the row
-- id, and only the last four characters of the card number are kept.
UPDATE customers
SET
    email       = 'customer_' || id || '@example.com',
    phone       = CONCAT('555-', LPAD(CAST(id % 10000 AS text), 4, '0')),
    credit_card = CONCAT('XXXX-XXXX-XXXX-', RIGHT(credit_card, 4));





Dynamic Data Masking

Dynamic masking applies real-time transformations to query results without modifying the underlying data.

PostgreSQL Dynamic Masking




-- Create a masked view: each column is returned in clear text only when
-- current_user matches one of the role names listed for that column.
CREATE VIEW users_redacted AS
SELECT
    id,
    CASE
        WHEN current_user = 'admin' THEN email
        ELSE '***@***.com'
    END AS email,
    CASE
        WHEN current_user = 'admin' THEN phone
        -- Star every digit that still has at least four digits after it.
        -- \D* skips separators, so formatted numbers such as 555-123-4567
        -- become ***-***-4567 instead of passing through unmasked.
        ELSE regexp_replace(phone, '\d(?=(\D*\d){4})', '*', 'g')
    END AS phone,
    CASE
        -- NOTE(review): this checks the role name 'support', but access is
        -- granted below to 'support_agent'; confirm which name is intended.
        WHEN current_user IN ('admin', 'support') THEN full_name
        ELSE CONCAT(LEFT(full_name, 1), '***')
    END AS full_name
FROM users;

-- Grant access to masked view
GRANT SELECT ON users_redacted TO app_user;
GRANT SELECT ON users_redacted TO support_agent;





Application-Level Dynamic Masking




from functools import wraps




def _mask_value(field, value):
    """Return the masked form of *value* for the field named *field*."""
    text = str(value)
    if field in ('email', 'email_address'):
        local, at, domain = text.rpartition('@')
        if not at or not local:
            # Not shaped like an address: reveal nothing rather than crash.
            return '***'
        return f"{local[0]}***@{domain}"
    if field in ('phone', 'phone_number'):
        return f"***-***-{text[-4:]}"
    if field in ('ssn', 'social_security'):
        return f"***-**-{text[-4:]}"
    if 'card' in field.lower():
        return f"****-****-****-{text[-4:]}"
    # The caller asked for this field to be masked but it matches no known
    # category, so redact it completely instead of leaving it in clear text.
    return '***'


def mask_sensitive_fields(fields_to_mask):
    """Decorator that masks sensitive fields in a function's dict result.

    Args:
        fields_to_mask: Names of the keys to mask. Emails keep their first
            character and domain; phones, SSNs and card numbers keep their
            last four characters; any other listed field becomes ``'***'``.

    Returns:
        A decorator. The wrapped function returns a masked shallow copy when
        its result is a dict, and the result unchanged otherwise. Keys that
        are absent or hold ``None`` are left alone, and non-string values are
        converted with ``str`` before masking.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            if not isinstance(result, dict):
                return result

            # Work on a copy so a dict the wrapped function keeps a reference
            # to (cache entry, module constant) is not masked in place.
            masked = result.copy()
            for field in fields_to_mask:
                if masked.get(field) is not None:
                    masked[field] = _mask_value(field, masked[field])
            return masked
        return wrapper
    return decorator




@mask_sensitive_fields(['email', 'phone', 'credit_card'])
def get_user_profile(user_id):
    """Return a sample profile for *user_id*.

    The function itself builds the unmasked record; the decorator masks the
    email, phone and credit_card entries before the caller sees them.
    """
    profile = dict(
        user_id=user_id,
        email='john.doe@example.com',
        phone='555-123-4567',
        credit_card='4111-1111-1111-1111',
    )
    return profile





Tokenization

Tokenization replaces sensitive data with non-sensitive placeholders (tokens) while storing the mapping in a secure vault.




class TokenizationService:
    """Swap sensitive values for opaque tokens whose mapping lives in a vault.

    The vault client must provide ``store(path, record)`` and
    ``retrieve(path)``; both are called with paths of the form
    ``tokens/<token>``.
    """

    # Roles allowed to turn a token back into the original value.
    _AUTHORIZED_ROLES = ('admin', 'auditor', 'compliance')

    def __init__(self, vault_client):
        """Bind the service to *vault_client* and set the token prefix."""
        self.vault = vault_client
        self.token_prefix = "tok_"

    def tokenize(self, sensitive_value, context):
        """Store *sensitive_value* in the vault and return its new token.

        Args:
            sensitive_value: The value to protect.
            context: Caller-supplied description stored alongside the value.

        Returns:
            A token made of the prefix followed by 32 random hex characters.
        """
        # secrets (not random) so tokens cannot be predicted.
        token = f"{self.token_prefix}{secrets.token_hex(16)}"

        self.vault.store(
            f"tokens/{token}",
            {
                'value': sensitive_value,
                'context': context,
                'created_at': self._now(),
                'access_count': 0,
            },
        )
        return token

    def detokenize(self, token, requester_role):
        """Return the original value behind *token* for an authorized role.

        Raises:
            ValueError: *token* does not start with the token prefix.
            PermissionError: *requester_role* may not detokenize.
            KeyError: the vault holds no record for *token*.
        """
        if not token.startswith(self.token_prefix):
            raise ValueError("Invalid token format")

        if requester_role not in self._AUTHORIZED_ROLES:
            # Refused attempts are audited as well as successful ones.
            self._audit_log('detokenize_denied', token, requester_role)
            raise PermissionError("Not authorized to detokenize")

        path = f"tokens/{token}"
        record = self.vault.retrieve(path)
        if record is None:
            raise KeyError(f"Unknown token: {token}")

        # Read-modify-write of the usage metadata. NOTE(review): concurrent
        # detokenize calls for one token can lose an increment unless the
        # vault serializes access.
        record['access_count'] += 1
        record['last_accessed'] = self._now()
        self.vault.store(path, record)

        self._audit_log('detokenize', token, requester_role)
        return record['value']

    @staticmethod
    def _now():
        """Return the current UTC time as a timezone-aware ISO 8601 string."""
        return datetime.now(timezone.utc).isoformat()

    def _audit_log(self, action, token, requester_role):
        """Record *action* on *token* by *requester_role* via ``logging``.

        Only the token is logged, never the value it stands for.
        """
        logging.getLogger(__name__).info(
            "action=%s token=%s role=%s", action, token, requester_role
        )





GDPR Compliance




from datetime import datetime, timedelta




class GDPRDataProcessor:
    """Helpers for GDPR export (Article 20) and erasure (Article 17) requests."""

    # Retention window per record category. Not read by the methods below.
    RETENTION_PERIODS = {
        'user_profile': timedelta(days=365),
        'transaction_log': timedelta(days=730),
        'session_log': timedelta(days=90),
        'analytics': timedelta(days=180),
    }

    def mask_for_export(self, user_data):
        """GDPR Article 20: build a portable export with masked references.

        Args:
            user_data: Mapping with ``email`` and ``username`` and, optionally,
                ``transactions`` and ``communications`` lists of dicts.

        Returns:
            A dict with ``basic_info``, ``transactions`` and
            ``communications``. Transaction references are replaced by
            ``REF-`` plus the first 8 hex characters of their SHA-256 hash,
            and communication content is cut to its first 100 characters
            (``None`` when the content is missing or empty).

        Raises:
            KeyError: a required key is missing from *user_data* or from one
                of its transaction or communication entries.
        """
        def masked_reference(reference):
            digest = hashlib.sha256(reference.encode()).hexdigest()
            return f"REF-{digest[:8]}"

        def snippet(entry):
            content = entry.get('content')
            return content[:100] if content else None

        return {
            'basic_info': {
                'email': user_data['email'],
                'username': user_data['username'],
            },
            'transactions': [
                {
                    'date': txn['date'],
                    'amount': txn['amount'],
                    'reference': masked_reference(txn['reference']),
                }
                for txn in user_data.get('transactions', [])
            ],
            'communications': [
                {
                    'date': msg['date'],
                    'type': msg['type'],
                    'content_snippet': snippet(msg),
                }
                for msg in user_data.get('communications', [])
            ],
        }

    def delete_user_data(self, user_id, databases):
        """GDPR Article 17: erase *user_id* from every database in *databases*.

        Each database must provide ``delete_user(user_id)`` and a ``name``
        attribute. A failure in one database does not stop the others.

        Returns:
            One log entry per database, in order. Every entry has
            ``database``, ``status`` (``'deleted'`` or ``'failed'``) and a
            timezone-aware UTC ``timestamp``; failed entries also carry
            ``error``.
        """
        deletion_log = []
        for db in databases:
            entry = {'database': db.name}
            try:
                db.delete_user(user_id)
            except Exception as exc:
                # Deliberately broad: the failure is recorded, not swallowed,
                # so the remaining databases are still processed.
                entry['status'] = 'failed'
                entry['error'] = str(exc)
            else:
                entry['status'] = 'deleted'
            entry['timestamp'] = datetime.now(timezone.utc).isoformat()
            deletion_log.append(entry)
        return deletion_log





Conclusion

Data masking is essential for privacy compliance and reducing the risk of data exposure. Use static masking for non-production environments, dynamic masking for real-time access control, and tokenization for scenarios requiring reversible de-identification. Always log access to sensitive data, enforce role-based masking policies, and ensure masking is consistent across all data stores and applications.