Easy web publishing from R
Write
R Markdown
documents in RStudio.
Share them here on RPubs.
(It’s free, and couldn’t be simpler!)
Get Started
Recently Published
week7datadive
This is my week 7 data dive where I conduct hypothesis testing using Fisher Significance Framework (FG% and shot distance) and Neyman-Pearson Framework (comparing FG% and position)
2010_CensoBR_Enrollment_rates_per_decile_GREEN_national_average.R
# 2010_CensoBR_Enrollment_rates_per_decile_GREEN_national_average.R
# ==============================================================================
# DESCRIPTION: TERTIARY EDUCATION ACCESS RATE BY INCOME DECILE
# Brazil Census 2010 | comparable to PNAD series (code 190_1)
#
# INDICATOR: Currently enrolled (V0628 ∈ {1,2} & V0629 ∈ {09-12})
# OR previously attended (V0633 ∈ {11-14})
# Ages 18-24 | Income: household per capita (V6531)
#
# VISUALIZATION:
# - Bars per decile with 95% CI (svyciprop logit)
# - National average in green (same style as PNAD code)
# - Labels and aesthetics in English
# ==============================================================================
library(censobr)
library(arrow)
library(dplyr)
library(Hmisc)
library(survey)
library(plotly)
# --- VISUALIZATION SETTINGS (mirror of PNAD code) ---
COR_NATIONAL <- "#2ca02c" # Strong green — national average
CORES_DECIS <- colorRampPalette(c("#D7191C", "#FFFFBF", "#2B83BA"))(10)
SHOW_CI_NATIONAL <- TRUE # Set FALSE to clean up
SHOW_CI_DECILES <- TRUE # Set FALSE to clean up
# ── 1. NATIONAL INCOME DECILE BREAKS ─────────────────────────────────────────
pop_renda <- read_population(
year = 2010,
columns = c("V6531", "V0010"),
add_labels = NULL,
showProgress = FALSE
) |>
filter(!is.na(V6531)) |>
collect()
q_breaks_full <- wtd.quantile(
x = pop_renda$V6531,
weights = pop_renda$V0010,
probs = seq(0, 1, 0.1),
na.rm = TRUE
)
cat("Income decile breaks (R$):\n"); print(round(q_breaks_full, 2))
rm(pop_renda)
# ── 2. LOAD DATA & FILTER AGES 18-24 ─────────────────────────────────────────
# V6036 = age in completed years
# V0628 = currently attending school (1=public, 2=private)
# V0629 = current course (09-12 = tertiary)
# V0633 = highest course ever attended (11-14 = tertiary)
# V6531 = household per capita income (R$, July 2010)
# V0010 = sampling weight
df_jovens <- read_population(
year = 2010,
columns = c("V6036", "V0628", "V0629", "V0633", "V6531", "V0010"),
add_labels = NULL,
showProgress = TRUE
) |>
filter(
as.integer(V6036) >= 18L,
as.integer(V6036) <= 24L,
!is.na(V6531)
) |>
collect()
# ── 3. BUILD INDICATOR + ASSIGN DECILES ──────────────────────────────────────
#
# Equivalence with PNAD code:
# ens_sup == 1 → currently enrolled → V0628 ∈ {1,2} & V0629 ∈ {09-12}
# ens_sup_a == 1 → previously attended → V0633 ∈ {11-14}
df_jovens <- df_jovens |>
mutate(
decil_renda = cut(
V6531,
breaks = q_breaks_full,
labels = paste0("D", 1:10),
include.lowest = TRUE
),
freq_current = !is.na(V0628) & V0628 %in% c("1", "2") &
!is.na(V0629) & V0629 %in% c("09", "10", "11", "12"),
freq_past = !is.na(V0633) & V0633 %in% c("11", "12", "13", "14"),
sup_total = as.integer(freq_current | freq_past)
) |>
filter(!is.na(decil_renda))
cat("Youth 18-24 in sample (with income):", nrow(df_jovens), "\n")
# ── 4. SURVEY DESIGN + COMPUTE RATES WITH 95% CI ─────────────────────────────
design_censo <- svydesign(ids = ~1, weights = ~V0010, data = df_jovens)
# By decile
taxa_decil <- svyby(
~sup_total, ~decil_renda, design_censo,
svyciprop, method = "logit", vartype = "ci"
) |>
mutate(
decil = as.character(decil_renda),
decil_num = as.numeric(gsub("D", "", decil)),
prop = sup_total * 100,
lower = ci_l * 100,
upper = ci_u * 100,
ano = 2010
)
# National average
taxa_nacional_raw <- svyciprop(~sup_total, design_censo,
method = "logit", vartype = "ci")
taxa_nacional <- data.frame(
decil = "National",
decil_num = 0,
prop = as.numeric(taxa_nacional_raw) * 100,
lower = attr(taxa_nacional_raw, "ci")[1] * 100,
upper = attr(taxa_nacional_raw, "ci")[2] * 100,
ano = 2010
)
cat("\nNational access rate — Census 2010 (ages 18-24):",
sprintf("%.1f%%", taxa_nacional$prop),
"[ 95% CI:", sprintf("%.1f", taxa_nacional$lower), "–",
sprintf("%.1f%%", taxa_nacional$upper), "]\n")
# ── 5. DECILE LABELS (mirror of PNAD code) ───────────────────────────────────
labels_decis <- c(
"Decile 1: bottom 10%",
"Decile 2: 10-20%",
"Decile 3: 20-30%",
"Decile 4: 30-40%",
"Decile 5: 40-50%",
"Decile 6: 50-60%",
"Decile 7: 60-70%",
"Decile 8: 70-80%",
"Decile 9: 80-90%",
"Decile 10: top 10%"
)
# ── 6. PLOTLY CHART ───────────────────────────────────────────────────────────
fig_2010_censo <- plot_ly()
# 6A) DECILE BARS (bottom layer)
for (i in 1:10) {
df_d <- taxa_decil |> filter(decil_num == i)
fig_2010_censo <- fig_2010_censo |>
add_trace(
data = df_d,
x = ~decil_num,
y = ~prop,
type = "bar",
name = labels_decis[i],
marker = list(
color = CORES_DECIS[i],
line = list(color = "white", width = 0.8)
),
error_y = list(
type = "data",
array = ~upper - prop,
arrayminus = ~prop - lower,
visible = SHOW_CI_DECILES,
thickness = 1.5,
width = 4,
color = "gray40"
)
# ,
# hoverinfo = "text",
# text = ~paste0(
# "<b>", labels_decis[i], "</b><br>",
# "<b>Access Rate:</b> ", sprintf("%.1f", prop), "%<br>",
# "<b>CI (95%):</b> [", sprintf("%.1f", lower), "% – ",
# sprintf("%.1f", upper), "%]"
# ),
# showlegend = TRUE
)
}
# 6B) NATIONAL AVERAGE LINE (top layer — same style as PNAD code)
fig_2010_censo <- fig_2010_censo |>
add_trace(
data = taxa_nacional,
x = list(0.5, 10.5), # horizontal line across full chart
y = list(taxa_nacional$prop, taxa_nacional$prop),
type = "scatter",
mode = "lines",
name = paste0("<b>National Average: ",
sprintf("%.1f", taxa_nacional$prop), "%</b>"),
line = list(
color = COR_NATIONAL,
width = 3,
dash = "dash"
),
hoverinfo = "text",
text = paste0(
"<b>★ NATIONAL AVERAGE (18-24) ★</b><br>",
"<b>Access Rate:</b> ", sprintf("%.1f", taxa_nacional$prop), "%<br>",
"<b>CI (95%):</b> [", sprintf("%.1f", taxa_nacional$lower), "% – ",
sprintf("%.1f", taxa_nacional$upper), "%]"
),
showlegend = TRUE
)
# 6C) LAYOUT
fig_2010_censo <- fig_2010_censo |>
layout(
title = list(
text = "<b>Tertiary education access rate by income decile in Brazil, 2010</b>",
x = 0.5,
y = 0.97,
font = list(size = 20)
),
xaxis = list(
title = "<b>Income decile (D1 = poorest · D10 = richest)</b>",
tickmode = "array",
tickvals = 1:10,
ticktext = paste0("D", 1:10),
gridcolor = "#E5E5E5"
),
yaxis = list(
title = "<b>Access rate (%)</b>",
ticksuffix = "%",
range = c(0, 100),
gridcolor = "#E5E5E5"
),
legend = list(
x = 0.02,
y = 0.98,
xanchor = "left",
yanchor = "top",
bgcolor = "rgba(255,255,255,0.85)",
bordercolor = "#CCCCCC",
borderwidth = 1,
font = list(size = 9),
traceorder = "reversed"
),
annotations = list(
list(
x = 0.5,
y = -0.15,
text = paste0(
"Source: Prepared by the author based on Brazil's Demographic Census 2010 (IBGE).",
" Microdata accessed via {censobr} (IPEA).<br>",
"Tertiary access: currently enrolled (V0628 & V0629) OR previously attended (V0633).",
" Ages 18–24. 95% CI via svyciprop (logit)."
),
showarrow = FALSE,
xref = "paper",
yref = "paper",
xanchor = "center",
yanchor = "top",
font = list(size = 11, color = "#555555")
)
),
barmode = "group",
margin = list(t = 90, b = 130, l = 60, r = 40),
plot_bgcolor = "white",
paper_bgcolor = "white",
hovermode = "closest"
)
fig_2010_censo
cat("\n✅ Chart generated: 2010_CensoBR_Enrollment_rates_per_decile_GREEN_national_average\n")
cat("\nADJUSTABLE SETTINGS (lines 20-23):\n")
cat(" • COR_NATIONAL: ", COR_NATIONAL, "\n", sep = "")
cat(" • SHOW_CI_NATIONAL: ", SHOW_CI_NATIONAL, "\n", sep = "")
cat(" • SHOW_CI_DECILES: ", SHOW_CI_DECILES, "\n", sep = "")
Integrating PCA and Cluster Analysis for Strategic Crime Fingerprinting
How can the LAPD identify unique crime ‘fingerprints’ by integrating spatial hotspots, temporal patterns, and victim demographics to transition from reactive to proactive policing? This analysis aims to move from broad patrolling to Precision Policing. By segmenting crimes into specific “Risk Fingerprints,” we provide the LAPD with actionable intelligence to deploy specialized units where they are most needed.
220_3_PNAD_Income_per_decile_INDIVIDUAL_TOGGLE_AGGREGATES
# 220_3_PNAD_Income_per_decile_INDIVIDUAL_TOGGLE_AGGREGATES
#==============================================================================
# MELHORIAS EM RELAÇÃO AO 190_2:
# 1) Cada decil é independente na legenda (sem legendgroup compartilhado)
# 2) Grupos agregados: Bottom 40%, Bottom 50%, Bottom 80%
# → Renda média ponderada do grupo inteiro por ano
# 3) Grupos agregados com linha tracejada para distinguir dos decis
#==============================================================================
library(dplyr)
library(survey)
library(Hmisc)
library(plotly)
# --- CONFIGURAÇÕES DE VISUALIZAÇÃO ---
COR_MEDIA_NACIONAL <- "#2ca02c"
LARGURA_MEDIA_NACIONAL <- 4
TAMANHO_MARCADOR_MEDIA <- 12
MOSTRAR_IC_MEDIA_NACIONAL <- TRUE
# Grupos agregados: nome, decis incluídos, cor
GRUPOS_AGREGADOS <- list(
list(nome = "<b>Bottom 40%</b> (D1–D4)", decis = 1:4, cor = "#e377c2", dash = "dash"),
list(nome = "<b>Bottom 50%</b> (D1–D5)", decis = 1:5, cor = "#ff7f0e", dash = "dashdot"),
list(nome = "<b>Bottom 80%</b> (D1–D8)", decis = 1:8, cor = "#9467bd", dash = "dot")
)
# =============================================================================
# 1) PREPARAÇÃO DOS DECIS NACIONAIS
# =============================================================================
pop_decis <- Salata_etal_2025_dataset %>%
group_by(ano) %>%
mutate(
decil_renda = cut(
renda_dom_pcta,
breaks = Hmisc::wtd.quantile(renda_dom_pcta, weights = peso, probs = seq(0, 1, 0.1)),
labels = paste0("D", 1:10),
include_lowest = TRUE
)
) %>%
ungroup() %>%
filter(!is.na(decil_renda)) %>%
mutate(decil_num = as.numeric(gsub("D", "", decil_renda)))
anos_disponiveis <- sort(unique(pop_decis$ano))
# =============================================================================
# 2) CÁLCULO: DECIS + NACIONAL + GRUPOS AGREGADOS
# =============================================================================
lista_decis <- list()
lista_nacional <- list()
# Uma lista por grupo agregado
lista_agr <- lapply(seq_along(GRUPOS_AGREGADOS), function(x) list())
for (a in anos_disponiveis) {
df_ano <- pop_decis %>% filter(ano == a)
design_pop <- svydesign(ids = ~1, weights = ~peso, data = df_ano)
# --- Por decil ---
renda_decil <- svyby(~renda_dom_pcta, ~decil_renda, design_pop, svymean, vartype = "ci")
renda_decil <- renda_decil %>%
mutate(ano = a, decil = as.character(decil_renda),
media = renda_dom_pcta, lower = ci_l, upper = ci_u)
lista_decis[[as.character(a)]] <- renda_decil
# --- Nacional ---
m_nac <- svymean(~renda_dom_pcta, design_pop)
ci_nac <- confint(m_nac)
lista_nacional[[as.character(a)]] <- data.frame(
ano = a,
media = as.numeric(m_nac["renda_dom_pcta"]),
lower = ci_nac["renda_dom_pcta", "2.5 %"],
upper = ci_nac["renda_dom_pcta", "97.5 %"]
)
# --- Grupos agregados ---
for (g in seq_along(GRUPOS_AGREGADOS)) {
df_grupo <- df_ano %>% filter(decil_num %in% GRUPOS_AGREGADOS[[g]]$decis)
design_grupo <- svydesign(ids = ~1, weights = ~peso, data = df_grupo)
m_g <- svymean(~renda_dom_pcta, design_grupo)
ci_g <- confint(m_g)
lista_agr[[g]][[as.character(a)]] <- data.frame(
ano = a,
media = as.numeric(m_g["renda_dom_pcta"]),
lower = ci_g["renda_dom_pcta", "2.5 %"],
upper = ci_g["renda_dom_pcta", "97.5 %"]
)
}
}
dados_decis <- bind_rows(lista_decis) %>%
mutate(decil_num = as.numeric(gsub("D", "", decil)))
dados_nacional <- bind_rows(lista_nacional)
dados_agr <- lapply(lista_agr, bind_rows)
# =============================================================================
# 3) RÓTULOS E CORES
# =============================================================================
labels_decis <- c(
"Decile 1: bottom 10%", "Decile 2: 10–20%", "Decile 3: 20–30%",
"Decile 4: 30–40%", "Decile 5: 40–50%", "Decile 6: 50–60%",
"Decile 7: 60–70%", "Decile 8: 70–80%", "Decile 9: 80–90%",
"Decile 10: top 10%"
)
cores <- colorRampPalette(c("#D7191C", "#FFFFBF", "#2B83BA"))(10)
# =============================================================================
# 4) CONSTRUÇÃO DO GRÁFICO
# =============================================================================
fig_220_3 <- plot_ly()
# ── 4A) DECIS INDIVIDUAIS ─────────────────────────────────────────────────────
# CHAVE: SEM legendgroup compartilhado → cada trace é independente na legenda
for (i in 1:10) {
df_d <- dados_decis %>% filter(decil_num == i)
fig_220_3 <- fig_220_3 %>%
add_trace(
data = df_d, x = ~ano, y = ~media,
type = 'scatter', mode = 'lines+markers',
name = labels_decis[i],
line = list(color = cores[i], width = 2.5),
marker = list(color = cores[i], size = 7),
error_y = list(
type = "data", array = ~upper - media, arrayminus = ~media - lower,
visible = TRUE, thickness = 1, width = 2, color = cores[i]
),
hoverinfo = "text",
text = ~paste0(
"<b>Year:</b> ", ano, "<br>",
"<b>", labels_decis[i], "</b><br>",
"<b>Mean Income:</b> R$ ", formatC(media, format = "f", digits = 0, big.mark = ","), "<br>",
"<b>CI (95%):</b> [R$ ", formatC(lower, format = "f", digits = 0, big.mark = ","),
" – R$ ", formatC(upper, format = "f", digits = 0, big.mark = ","), "]"
),
# ↓ legendgroup ÚNICO por decil → toggle individual
legendgroup = paste0("decil_", i),
showlegend = TRUE
)
}
# ── 4B) GRUPOS AGREGADOS (linha tracejada, marcador triângulo) ────────────────
for (g in seq_along(GRUPOS_AGREGADOS)) {
grp <- GRUPOS_AGREGADOS[[g]]
df_g <- dados_agr[[g]]
fig_220_3 <- fig_220_3 %>%
add_trace(
data = df_g, x = ~ano, y = ~media,
type = 'scatter', mode = 'lines+markers',
name = grp$nome,
line = list(color = grp$cor, width = 3, dash = grp$dash),
marker = list(color = grp$cor, size = 9, symbol = "triangle-up",
line = list(color = "white", width = 1)),
error_y = list(
type = "data", array = ~upper - media, arrayminus = ~media - lower,
visible = TRUE, thickness = 1.5, width = 3, color = grp$cor
),
hoverinfo = "text",
text = ~paste0(
"<b>Year:</b> ", ano, "<br>",
"<b>", grp$nome, "</b><br>",
"<b>Mean Income:</b> R$ ", formatC(media, format = "f", digits = 0, big.mark = ","), "<br>",
"<b>CI (95%):</b> [R$ ", formatC(lower, format = "f", digits = 0, big.mark = ","),
" – R$ ", formatC(upper, format = "f", digits = 0, big.mark = ","), "]"
),
legendgroup = paste0("agregado_", g),
showlegend = TRUE
)
}
# ── 4C) MÉDIA NACIONAL (camada superior) ──────────────────────────────────────
fig_220_3 <- fig_220_3 %>%
add_trace(
data = dados_nacional, x = ~ano, y = ~media,
type = 'scatter', mode = 'lines+markers',
name = "<b>National Average</b>",
line = list(color = COR_MEDIA_NACIONAL, width = LARGURA_MEDIA_NACIONAL),
marker = list(color = COR_MEDIA_NACIONAL, size = TAMANHO_MARCADOR_MEDIA,
symbol = "diamond", line = list(color = "white", width = 1.5)),
error_y = list(
type = "data", array = ~upper - media, arrayminus = ~media - lower,
visible = MOSTRAR_IC_MEDIA_NACIONAL, thickness = 2, width = 3,
color = COR_MEDIA_NACIONAL
),
hoverinfo = "text",
text = ~paste0(
"<b>Year:</b> ", ano, "<br>",
"<b>★ NATIONAL AVERAGE ★</b><br>",
"<b>Mean Income:</b> R$ ", formatC(media, format = "f", digits = 0, big.mark = ","), "<br>",
"<b>CI (95%):</b> [R$ ", formatC(lower, format = "f", digits = 0, big.mark = ","),
" – R$ ", formatC(upper, format = "f", digits = 0, big.mark = ","), "]"
),
legendgroup = "nacional",
showlegend = TRUE
)
# =============================================================================
# 5) LAYOUT
# =============================================================================
fig_220_3 <- fig_220_3 %>%
layout(
title = list(
text = "<b>Mean per capita household income by income decile in Brazil, 1992–2022</b>",
x = 0.5, y = 0.97, font = list(size = 20)
),
xaxis = list(
title = "<b>Year</b>", gridcolor = "#E5E5E5",
tickmode = "array", tickvals = anos_disponiveis,
tickangle = -45, font = list(size = 10)
),
yaxis = list(
title = "<b>Mean per capita household income (BRL)</b>",
tickprefix = "R$ ", gridcolor = "#E5E5E5"
),
legend = list(
x = 0.01, y = 0.99,
xanchor = "left", yanchor = "top",
bgcolor = 'rgba(255,255,255,0.85)',
bordercolor = "#CCCCCC", borderwidth = 1,
font = list(size = 9),
traceorder = "reversed"
),
annotations = list(list(
x = 0.5, y = -0.15,
text = "Source: Prepared by the author based on PNAD/PNAD-C (IBGE) data, harmonized by Salata et al. (2025).<br>Article, code and data available at doi:10.7910/DVN/UZ2SHW",
showarrow = FALSE, xref = 'paper', yref = 'paper',
xanchor = 'center', yanchor = 'top',
font = list(size = 14, color = "#555555")
)),
margin = list(t = 90, b = 120, l = 80, r = 40),
plot_bgcolor = "white",
paper_bgcolor = "white",
hovermode = "closest"
)
fig_220_3
cat("\n✅ Gráfico 220_3 gerado!\n")
cat("\nMELHORIAS IMPLEMENTADAS:\n")
cat("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n")
cat("✓ Toggle INDIVIDUAL por decil (legendgroup único por trace)\n")
cat("✓ Bottom 40% (D1–D4) → rosa, linha tracejada, triângulo\n")
cat("✓ Bottom 50% (D1–D5) → laranja, dash-dot, triângulo\n")
cat("✓ Bottom 80% (D1–D8) → roxo, dotted, triângulo\n")
cat("✓ Agregados calculados com svymean() no subconjunto dos decis\n")
League of Legends Champion Clustering and Dimension Reduction Analysis
This project applies unsupervised learning to a dataset of League of Legends champion base statistics sourced from Kaggle (Cute Dango, League of Legends Champions dataset, available at kaggle.com/datasets/cutedango/league-of-legends-champions) to discover whether champions naturally cluster into distinct statistical archetypes, and which features drive those groupings.
The workflow combines three complementary approaches:
Hard clustering (K-Means, Hierarchical) to identify stable, discrete champion archetypes,
Soft clustering (Fuzzy C-Means) to quantify champion hybridity, how strongly each champion belongs to one archetype versus another,
Dimensionality reduction (PCA, MDS, UMAP, t-SNE, SOM) to visualize the structure of the feature space and validate clustering results across multiple independent methods.
Epi553_lab06_ShahiSuruchi
This is the completion of 6th lab for class Epi 553.