🍳 Cookbook
Practical examples of common data transformation tasks using fimod. All examples are compatible with Monty's Python subset.
Basic Transformations
🏷️ Renaming Keys
# Add snake_case copies of spreadsheet-style columns to every row.
#   data: list of row dicts; rows are modified in place and the same list
#         is returned.
#   args, env, headers: not used by this recipe.
# "Age" is converted with int(), so a non-numeric value raises ValueError.
# The source keys ("First Name", "Age") are left in place next to the new ones.
def transform(data, args, env, headers):
    for row in data:
        if "First Name" in row:
            row["first_name"] = row["First Name"]
        if "Age" in row:
            row["age"] = int(row["Age"])
    return data
Data Structuring
Flat CSV to Nested JSON
Convert flat rows into a structured object indexed by ID.
Input (CSV):
id,role,permission
101,admin,read
101,admin,write
102,user,read
Script:
# Fold flat rows into one object per "id".
#   data: list of row dicts with "id", "role" and "permission" keys.
#   Returns {id: {"role": ..., "permissions": [...]}}.
# The role is taken from the first row seen for an id; every row, including
# that first one, contributes its permission to the list.
def transform(data, args, env, headers):
    result = {}
    for row in data:
        user_id = row["id"]
        if user_id not in result:
            result[user_id] = {"role": row["role"], "permissions": []}
        # Deliberately outside the `if`: the first row's permission counts too.
        result[user_id]["permissions"].append(row["permission"])
    return result
Output (JSON):
{
"101": {"role": "admin", "permissions": ["read", "write"]},
"102": {"role": "user", "permissions": ["read"]}
}
🧹 Data Cleaning
Masking Sensitive Data
# Mask the local part of each user's email, keeping its first character
# and the domain: "alice@example.com" -> "a***@example.com".
#   data: list of user dicts; modified in place and returned.
# Missing, null or empty emails are left untouched. A value that is not of
# the form local@domain (no "@", several "@", empty local part or domain)
# is replaced by "***" so that nothing unmasked leaks through.
def transform(data, args, env, headers):
    for user in data:
        email = user.get("email")
        if not email:
            continue
        parts = email.split("@")
        if len(parts) == 2 and parts[0] and parts[1]:
            user["email"] = f"{parts[0][0]}***@{parts[1]}"
        else:
            user["email"] = "***"
    return data
🧽 Deduplication + Normalization
# Drop rows whose email was already seen and normalise the survivors.
#   data: list of row dicts with "first_name", "last_name", "email" and an
#         optional "dept".
#   Returns a new list of {"name", "email", "department"} dicts.
# Emails are compared after strip() + lower(); the first occurrence wins.
# A missing, null or empty "dept" becomes "UNKNOWN".
def transform(data, args, env, headers):
    seen = {}  # dict used as a set of normalised emails
    result = []
    for row in data:
        email = row["email"].strip().lower()
        if email in seen:
            continue
        seen[email] = True
        result.append({
            "name": f"{row['first_name'].strip().title()} {row['last_name'].strip().title()}",
            "email": email,
            "department": (row.get("dept") or "unknown").upper(),
        })
    return result
Aggregation
Group by + Average
# Group employees by "department" and report head count and mean salary.
#   data: list of dicts with "department" and a numeric "salary".
#   Returns one {"dept", "count", "avg_salary"} dict per department, in
#   order of first appearance.
def transform(data, args, env, headers):
    depts = {}
    for e in data:
        d = e["department"]
        if d not in depts:
            depts[d] = {"dept": d, "count": 0, "total": 0}
        entry = depts[d]
        entry["count"] = entry["count"] + 1
        entry["total"] = entry["total"] + e["salary"]
    result = []
    for entry in depts.values():
        result.append({
            "dept": entry["dept"],
            "count": entry["count"],
            # count is at least 1 for every stored entry, so this never divides by zero
            "avg_salary": entry["total"] / entry["count"]
        })
    return result
Regex Recipes
📧 Extract Email Addresses
# Return every email-like substring found in data["text"] as {"emails": [...]}.
# The local part may contain word characters, ".", "+" and "-"; the domain is
# one or more dot-separated labels, so "john.doe@mail.example.co.uk" is
# captured whole instead of being cut down to "doe@mail.example".
# The group is non-capturing so that re_findall keeps returning whole matches.
def transform(data, args, env, headers):
    return {"emails": re_findall(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+", data["text"])}
🧽 Normalize Whitespace
# Strip data["text"], then replace every run of whitespace (\s+) with a
# single space. Returns {"cleaned": <normalised text>}.
def transform(data, args, env, headers):
    return {"cleaned": re_sub(r"\s+", " ", data["text"].strip())}
Extract URLs
# Collect http:// and https:// URLs from data["text"].
# A URL is taken to run up to the next whitespace character, so trailing
# punctuation directly attached to a URL is included in the match.
# Returns {"urls": [...], "count": <number of URLs>}.
def transform(data, args, env, headers):
    urls = re_findall(r"https?://[^\s]+", data["text"])
    return {"urls": urls, "count": len(urls)}
🏷️ Parse Structured Strings
# Parse "KEY=VALUE" pairs from config text
# Returns a dict mapping each key to its value; if a key occurs more than
# once, the last occurrence wins. Values end at the first whitespace (\S+).
def transform(data, args, env, headers):
    pairs = re_findall(r"(\w+)=(\S+)", data["text"])
    # With 2 capture groups, re_findall returns [["key","val"], ...]
    result = {}
    for pair in pairs:
        result[pair[0]] = pair[1]
    return result
Or with named groups:
# Line-by-line variant using named groups: each line of data["text"] that
# has the form key=value contributes one entry; other lines are skipped.
# Unlike the \S+ variant, the value (.+) may contain spaces.
# Named captures are read from the "named" entry of the re_search result.
def transform(data, args, env, headers):
    result = {}
    for line in data["text"].strip().split("\n"):
        m = re_search(r"^(?P<key>\w+)=(?P<val>.+)$", line)
        if m:
            result[m["named"]["key"]] = m["named"]["val"]
    return result
Validate Patterns
# Check if values match expected patterns
# Adds a boolean "valid_phone" to every row: True when "phone" consists of
# an optional leading "+" followed by 10 to 15 digits and nothing else.
# The pattern is anchored on both ends so that trailing characters
# ("+12345678901 ext. 4") no longer pass. A missing or null phone is
# treated as "" and therefore invalid.
def transform(data, args, env, headers):
    for row in data:
        phone = row.get("phone") or ""
        row["valid_phone"] = re_match(r"^\+?\d{10,15}$", phone) is not None
    return data
Log Analysis
Filter Error Lines
Count by Level
# Count log lines per level.
#   data: list of log lines (strings).
#   Returns {level: count} for the levels that occur at least once.
# A level is detected by plain substring search, so a line that mentions
# several level names is counted once under each of them.
def transform(data, args, env, headers):
    levels = {}
    for line in data:
        for level in ["ERROR", "WARN", "INFO", "DEBUG"]:
            if level in line:
                levels[level] = levels.get(level, 0) + 1
    return levels
Filter with Regex
# Lines with 4xx/5xx status codes
fimod s -i access.log --input-format lines \
-e '[l for l in data if re_search(r"\s[45]\d{2}\s", l)]'
API & HTTP
Awesome: the input can be an HTTPS request! Just -i https://... and you're done.
Nested API → Flat CSV
Fetch, transform, and save as CSV in one command — using JSONPlaceholder as a live public API:
fimod s -i https://jsonplaceholder.typicode.com/users \
-e '[{
"id": u["id"],
"name": u["name"],
"email": u["email"],
"city": dp_get(u, "address.city"),
"company": dp_get(u, "company.name")
} for u in data]' \
-o contacts.csv
dp_get safely navigates nested fields (address.city, company.name) without risking a KeyError.
Get the Latest Release Tag
# Inspect the redirect to extract the version
fimod s -i https://github.com/pytgaen/fimod/releases/latest \
--input-format http --no-follow \
-e 'data["headers"]["location"].split("/")[-1]' --output-format txt
# → v0.3.0
Fetch + Re-parse with set_input_format()
# Get raw HTTP response, then parse the body as JSON
fimod s -i https://api.github.com/repos/pytgaen/fimod/releases/latest \
--input-format http \
-e 'set_input_format("json"); data["body"]' \
-e '{"tag": data["tag_name"], "date": data["published_at"]}'
Parameterized Scripts (--arg)
🎯 Reusable Filter
# filter_by_field.py — generic filter script
# Keeps the rows whose args["field"] entry equals args["value"].
#   args: must contain "field" (key to inspect) and "value" (value to match).
# Rows that lack the field are dropped unless the requested value is None.
# The comparison is a plain ==, so the value is compared as passed in args.
def transform(data, args, env, headers):
    field = args["field"]
    value = args["value"]
    return [row for row in data if row.get(field) == value]
# Reuse with different parameters
fimod s -i users.json -m filter_by_field.py --arg field="role" --arg value="admin"
fimod s -i users.json -m filter_by_field.py --arg field="status" --arg value="active"
🔢 Threshold with Type Casting
# Keep the users strictly older than args["min_age"].
# The argument is converted with int() before comparing against the
# numeric "age" field of each user.
def transform(data, args, env, headers):
    limit = int(args["min_age"])
    return [u for u in data if u["age"] > limit]
🏷️ Dynamic Prefix/Suffix
Dotpath Access
Read Nested Fields Safely
# dp_get avoids KeyError for missing/optional fields
# The third argument is the fallback used for city and country ("unknown");
# "items.-1" is requested without a fallback.
# Returns {"city", "country", "last_item"}.
def transform(data, args, env, headers):
    city = dp_get(data, "address.city", "unknown")
    country = dp_get(data, "address.country", "unknown")
    last = dp_get(data, "items.-1")  # last array element
    return {"city": city, "country": country, "last_item": last}
✏️ Set Nested Fields
# dp_set returns a new copy — original is unchanged
# The result is therefore re-bound to `data` after each call so that the
# second dp_set builds on the first. Sets meta.source and meta.version.
def transform(data, args, env, headers):
    data = dp_set(data, "meta.source", "fimod")
    data = dp_set(data, "meta.version", "1")
    return data
Iteration Helpers
Group Records by Field
# it_group_by takes a field name (string), not a lambda
# Groups the records in data by their "department" value.
def transform(data, args, env, headers):
    return it_group_by(data, "department")
fimod s -i employees.json -m group.py --output-format json
# → {"engineering": [...], "sales": [...]}
Sort Records by Field
🧹 Deduplicate by Field
# Keep first occurrence, discard duplicates by email
# The field is passed to it_unique_by by name, as a string.
def transform(data, args, env, headers):
    return it_unique_by(data, "email")
Flatten Nested Arrays
# Flatten arbitrarily nested arrays into a single flat list.
def transform(data, args, env, headers):
    # data = [[1, 2], [3, [4, 5]]] → [1, 2, 3, 4, 5]
    return it_flatten(data)
#️⃣ Hashing for Anonymisation
Replace PII with SHA-256
# hash_pii.py
# fimod: input-format=csv, output-format=csv
# Replaces the "email" and "phone" value of every row with its hs_sha256
# hash. Rows are modified in place; both keys must be present on each row.
def transform(data, args, env, headers):
    for row in data:
        row["email"] = hs_sha256(row["email"])
        row["phone"] = hs_sha256(row["phone"])
    return data
Generate Stable IDs from Keys
# Give every row an "id" derived from its content: the hs_md5 hash of
# "<name>|<dob>". Rows with the same name and date of birth therefore get
# the same id. Rows are modified in place.
def transform(data, args, env, headers):
    for row in data:
        key = f"{row['name']}|{row['dob']}"
        row["id"] = hs_md5(key)
    return data
✅ Validation with --check
🛡️ Validate a Config File
# validate_config.py
# Returns True only when "host", "port" and "db" are all present in data
# with truthy values; an empty string, 0 or null counts as missing.
def transform(data, args, env, headers):
    required = ["host", "port", "db"]
    return all(k in data and data[k] for k in required)
fimod s -i config.json -m validate_config.py --check
if [ $? -ne 0 ]; then
echo "ERROR: config.json is missing required fields" >&2
exit 1
fi
Assert an API Response is Non-Empty
# Exit 1 if response array is empty or null
curl -s https://jsonplaceholder.typicode.com/todos | \
fimod s --input-format json -e 'data and len(data) > 0' --check
🚫 Data Generation with --no-input
Generate a Fixture from Arguments
# gen_users.py
# Builds args["count"] user records with ids 1..count.
#   args["count"]: required; converted with int().
#   args["prefix"]: optional name prefix, defaults to "user".
# data is not read, so the script is suitable for --no-input runs.
def transform(data, args, env, headers):
    n = int(args["count"])
    prefix = args.get("prefix", "user")
    return [{"id": i, "name": f"{prefix}{i}", "active": True} for i in range(1, n + 1)]
Emit a Timestamp Record
fimod s --no-input -e '{"generated_at": args["ts"], "env": args["env"]}' \
--arg ts="2024-01-15T12:00:00Z" --arg env="production"
📥 Slurp Mode
Merge Multiple JSON Files
# Each file is a single JSON object — collect into an array
cat config-dev.json config-prod.json | fimod s --slurp -e 'data'
# → [{"env": "dev", ...}, {"env": "prod", ...}]
Merge Config Files (Base + Overrides)
# base.yaml (defaults) + prod.toml (overrides) → merged JSON
fimod s -i base.yaml -i prod.toml --slurp -e '
def transform(data):
    # data is the slurped list of inputs: data[0] holds the defaults and
    # data[1] the overrides (presumably in -i order).
    # update() overwrites top-level keys of data[0] in place; nested
    # objects are replaced as a whole, not merged recursively.
    data[0].update(data[1])
    return data[0]
' --output-format json
Note:
`{**a, **b}` and `a | b` are not supported in Monty. Use `a.update(b)` for dict merging.