# R
R is a programming language and environment specifically designed for statistical computing and graphics. It's widely used in academia, research, and industry for data analysis, statistical modeling, and data visualization.
## Core Features

### Statistical Computing
- **Built-in statistics**: Comprehensive statistical functions
- **Statistical models**: Linear, nonlinear, and mixed-effects models
- **Hypothesis testing**: Extensive testing procedures
- **Probability distributions**: All major distributions included (sampled in the sketch after this list)
- **Time series analysis**: ARIMA, VAR, and other methods
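Most of these ship with base R's stats package, which loads by default. A minimal sketch of the distribution and testing helpers (the sample values are illustrative):

```r
# Draw from and query the normal distribution (base stats functions)
samples <- rnorm(100, mean = 50, sd = 10)  # 100 random draws from N(50, 10)
pnorm(60, mean = 50, sd = 10)              # P(X <= 60)
qnorm(0.975)                               # 97.5th percentile of the standard normal

# One-sample t-test of whether the sample mean differs from 50
t.test(samples, mu = 50)
```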
### Data Structures

```r
# Vectors
numbers <- c(1, 2, 3, 4, 5)
names <- c("Alice", "Bob", "Charlie")

# Data frames
data <- data.frame(
  name = c("Alice", "Bob", "Charlie"),
  age = c(25, 30, 35),
  score = c(95, 87, 92)
)

# Lists
results <- list(
  model = lm(score ~ age, data = data),
  summary_stats = summary(data),
  plots = list()
)

# Matrices
matrix_data <- matrix(1:12, nrow = 3, ncol = 4)
```

## Data Manipulation
### Base R

```r
# Data filtering and selection
filtered_data <- data[data$age > 25, ]
high_scores <- subset(data, score > 90)

# Aggregation
aggregate_results <- aggregate(score ~ department, data = employee_data, mean)

# Apply functions
means_by_column <- apply(numeric_data, 2, mean)
row_sums <- apply(numeric_data, 1, sum)
```

### dplyr (Tidyverse)
```r
library(dplyr)
library(magrittr)

# Modern data manipulation
result <- employee_data %>%
  filter(department == "Engineering") %>%
  group_by(level) %>%
  summarise(
    avg_salary = mean(salary),
    count = n(),
    median_experience = median(years_experience)
  ) %>%
  arrange(desc(avg_salary))

# Data transformation
transformed_data <- raw_data %>%
  mutate(
    log_value = log(value),
    normalized_score = (score - mean(score)) / sd(score),
    category_factor = as.factor(category)
  ) %>%
  select(id, log_value, normalized_score, category_factor)
```

### tidyr for Data Reshaping
```r
library(tidyr)

# Pivot operations
wide_data <- long_data %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  )

long_data <- wide_data %>%
  pivot_longer(
    cols = c(var1, var2, var3),
    names_to = "variable",
    values_to = "value"
  )

# Separate and unite columns
separated <- data %>%
  separate(full_name, into = c("first", "last"), sep = " ")

united <- data %>%
  unite("full_address", street, city, state, sep = ", ")
```

## Statistical Analysis
### Descriptive Statistics
```r
# Summary statistics
summary(data)
str(data)

# Custom summaries
descriptive_stats <- data %>%
  summarise(
    mean_value = mean(value, na.rm = TRUE),
    median_value = median(value, na.rm = TRUE),
    sd_value = sd(value, na.rm = TRUE),
    q25 = quantile(value, 0.25, na.rm = TRUE),
    q75 = quantile(value, 0.75, na.rm = TRUE),
    correlation = cor(value1, value2, use = "complete.obs")
  )
```

### Inferential Statistics
```r
# t-tests
t_test_result <- t.test(group_a, group_b)
print(t_test_result)

# ANOVA
anova_model <- aov(response ~ factor1 * factor2, data = experimental_data)
summary(anova_model)
TukeyHSD(anova_model)

# Chi-square test
chi_square <- chisq.test(contingency_table)
print(chi_square)

# Correlation tests
cor_test <- cor.test(x, y, method = "pearson")
print(cor_test)
```

### Regression Analysis
```r
# Linear regression
linear_model <- lm(dependent_var ~ independent_var1 + independent_var2,
                   data = dataset)
summary(linear_model)

# Model diagnostics
par(mfrow = c(2, 2))
plot(linear_model)

# Multiple regression with interaction
simple_model <- lm(y ~ x1 + x3, data = dataset)
complex_model <- lm(y ~ x1 * x2 + x3 + I(x1^2), data = dataset)
anova(simple_model, complex_model)  # F-test comparison of nested models

# Logistic regression
logistic_model <- glm(binary_outcome ~ predictor1 + predictor2,
                      family = binomial, data = dataset)
summary(logistic_model)
```

## Data Visualization
### Base R Graphics
```r
# Basic plots
plot(x, y, main = "Scatter Plot", xlab = "X Variable", ylab = "Y Variable")
lines(x, fitted_values, col = "red", lwd = 2)

# Histograms and density plots
# freq = FALSE puts the histogram on the density scale so the overlay lines up
hist(data$variable, breaks = 30, main = "Distribution", freq = FALSE)
lines(density(data$variable), col = "red", lwd = 2)

# Box plots
boxplot(value ~ category, data = dataset, main = "Values by Category")

# Multiple plots
par(mfrow = c(2, 2))
plot(model1)
par(mfrow = c(1, 1))  # Reset
```

### ggplot2
```r
library(ggplot2)

# Grammar of graphics approach
p1 <- ggplot(data, aes(x = variable1, y = variable2)) +
  geom_point(aes(color = category, size = weight)) +
  geom_smooth(method = "lm", se = TRUE) +
  scale_color_brewer(type = "qual", palette = "Set1") +
  labs(
    title = "Relationship between Variables",
    subtitle = "With linear trend line",
    x = "Independent Variable",
    y = "Dependent Variable",
    color = "Category"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )

# Faceted plots
p2 <- ggplot(time_series_data, aes(x = date, y = value)) +
  geom_line(aes(color = series)) +
  facet_wrap(~ region, scales = "free_y") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Save plots
ggsave("analysis_plot.png", p1, width = 10, height = 6, dpi = 300)
```

### Interactive Visualizations
```r
library(plotly)
library(DT)

# Interactive scatter plot
interactive_plot <- plot_ly(
  data = dataset,
  x = ~x_var,
  y = ~y_var,
  color = ~category,
  size = ~size_var,
  text = ~paste("ID:", id, "<br>Value:", value),
  hovertemplate = "%{text}<extra></extra>"
) %>%
  add_markers() %>%
  layout(
    title = "Interactive Scatter Plot",
    xaxis = list(title = "X Variable"),
    yaxis = list(title = "Y Variable")
  )

# Interactive data table
datatable(
  dataset,
  options = list(
    pageLength = 25,
    searchHighlight = TRUE,
    scrollX = TRUE
  ),
  filter = "top"
)
```

## Advanced Statistical Methods
### Time Series Analysis
```r
library(forecast)
library(tseries)

# Time series decomposition
ts_data <- ts(monthly_data, start = c(2020, 1), frequency = 12)
decomposition <- decompose(ts_data)
plot(decomposition)

# ARIMA modeling
auto_arima <- auto.arima(ts_data)
summary(auto_arima)

# Forecasting
forecast_result <- forecast(auto_arima, h = 12)
plot(forecast_result)

# Seasonal adjustment
seasonally_adjusted <- seasadj(decomposition)
```

### Machine Learning
```r
library(caret)
library(randomForest)
library(e1071)

# Data preparation
set.seed(123)
train_index <- createDataPartition(data$target, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

# Random forest
rf_model <- randomForest(
  target ~ .,
  data = train_data,
  ntree = 500,
  importance = TRUE
)

# Variable importance
importance(rf_model)
varImpPlot(rf_model)

# Predictions and evaluation
predictions <- predict(rf_model, test_data)
confusionMatrix(predictions, test_data$target)

# Cross-validation
cv_control <- trainControl(method = "cv", number = 10)
cv_model <- train(
  target ~ .,
  data = train_data,
  method = "rf",
  trControl = cv_control
)
```

## Package Management
### CRAN Packages
```r
# Install packages
install.packages(c("dplyr", "ggplot2", "tidyr"))

# Load packages
library(dplyr)
library(ggplot2)

# Check installed packages
installed.packages()

# Update packages
update.packages()
```

### Development Packages
```r
# Install from GitHub
devtools::install_github("username/package_name")

# Install from Bioconductor
BiocManager::install("package_name")

# Package documentation
help(package = "dplyr")
vignette("dplyr")
```

## R Markdown & Reporting
R Markdown documents combine YAML metadata, narrative text, and executable code chunks in a single file:

````markdown
---
title: "Data Analysis Report"
author: "Data Scientist"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_float: true
    theme: flatly
---

# Analysis Overview

This report presents findings from our data analysis.

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(dplyr)
library(ggplot2)
```

```{r}
# Load and analyze data
data <- read.csv("data.csv")
summary_stats <- data %>%
  group_by(category) %>%
  summarise(
    mean_value = mean(value),
    count = n()
  )
knitr::kable(summary_stats, caption = "Summary Statistics by Category")
```

```{r}
# Create visualization
ggplot(data, aes(x = category, y = value)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Value Distribution by Category")
```

# Conclusions

Based on the analysis, we found...
````
## Popular Packages
### Data Manipulation
- **dplyr**: Grammar of data manipulation
- **tidyr**: Tidy messy data
- **data.table**: Fast data manipulation (sketched after this list)
- **stringr**: String operations
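As a point of contrast with the dplyr pipeline shown earlier, a hedged sketch of data.table's `dt[i, j, by]` bracket syntax, reusing the illustrative employee_data columns from above:

```r
library(data.table)

# Convert a data frame to a data.table
dt <- as.data.table(employee_data)

# Filter, aggregate, and group in one bracket call; .N is the group row count
dt[department == "Engineering",
   .(avg_salary = mean(salary), count = .N),
   by = level]
```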
### Visualization
- **ggplot2**: Grammar of graphics
- **plotly**: Interactive visualizations
- **lattice**: Trellis graphics
- **leaflet**: Interactive maps (sketched below)
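As one example, leaflet composes maps with the same pipe style as the plotly snippet above; a minimal sketch with illustrative coordinates:

```r
library(leaflet)

# A tiled basemap with a single popup marker
leaflet() %>%
  addTiles() %>%  # default OpenStreetMap tiles
  addMarkers(lng = -122.42, lat = 37.77, popup = "San Francisco")
```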
### Statistical Analysis
- **caret**: Classification and regression training
- **forecast**: Time series forecasting
- **survival**: Survival analysis
- **lme4**: Mixed-effects models (example below)
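Mixed-effects models, mentioned under Core Features but not demonstrated above, reuse the familiar formula interface with an added random-effects term; a sketch assuming an illustrative repeated-measures dataset:

```r
library(lme4)

# Fixed effect for treatment, random intercept per subject
mixed_model <- lmer(score ~ treatment + (1 | subject), data = study_data)
summary(mixed_model)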
### Specialized Domains
- **Bioconductor**: Bioinformatics packages
- **quantmod**: Quantitative financial modeling
- **sp/sf**: Spatial data analysis (sf example after this list)
- **tm**: Text mining
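For spatial work, a brief sf sketch reading the North Carolina shapefile that ships with the package:

```r
library(sf)

# Read a bundled shapefile and plot its geometry
nc <- st_read(system.file("shape/nc.shp", package = "sf"))
plot(st_geometry(nc))
```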
## Best Practices
### Code Organization
1. **Project structure**: Use RStudio projects
2. **Reproducibility**: Set random seeds and document versions (see the sketch after this list)
3. **Documentation**: Comment code and create README files
4. **Version control**: Use Git for tracking changes
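A minimal reproducibility preamble for point 2 (the seed value is arbitrary):

```r
# Fix the random number stream so sampling-based results repeat exactly
set.seed(42)

# Record the R version and loaded package versions alongside the analysis
sessionInfo()
```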
### Data Analysis
1. **Exploratory analysis**: Always explore data first
2. **Data validation**: Check for outliers and missing values (example after this list)
3. **Assumptions**: Verify statistical assumptions
4. **Visualization**: Create informative plots
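A quick validation pass for point 2 (the `value` column is illustrative):

```r
# Missing values per column
colSums(is.na(data))

# Flag values more than 3 SDs from the mean as candidate outliers
z <- (data$value - mean(data$value, na.rm = TRUE)) / sd(data$value, na.rm = TRUE)
data[abs(z) > 3 & !is.na(z), ]
```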
### Programming Style
```r
# Good practices: document purpose, arguments, and return value
library(dplyr)
library(stringr)

#' Clean and prepare raw data for analysis
#'
#' @param raw_data Raw dataset from source
#' @return Cleaned dataset ready for analysis
clean_data <- function(raw_data) {
  cleaned <- raw_data %>%
    filter(!is.na(key_variable)) %>%
    mutate(
      standardized_name = str_to_lower(str_trim(name)),
      date_formatted = as.Date(date_string, format = "%Y-%m-%d")
    ) %>%
    select(-temporary_columns)
  return(cleaned)
}
```

## When to Choose R
### Ideal For
- Statistical analysis and modeling
- Data exploration and visualization
- Academic research and publications
- Bioinformatics and life sciences
- Financial analysis and risk modeling
- Survey data analysis
- Reproducible research reports
### Consider Alternatives When

- Building production web applications
- Developing real-time systems
- Writing mobile apps
- Running large-scale distributed computing jobs
- Handling memory-intensive workloads (R holds working data in RAM)
## Learning Resources

### Books
- **R for Data Science**: Tidyverse approach
- **The Art of R Programming**: Programming fundamentals
- **Advanced R**: Deep dive into R internals
- **An Introduction to Statistical Learning**: Machine learning with applications in R
### Online Resources
- **CRAN documentation**: Comprehensive package docs
- **RStudio resources**: Tutorials and guides
- **R-bloggers**: Community blog aggregator
- **Stack Overflow**: Q&A community
## Industry Applications
R is widely used in academia, pharmaceuticals, finance, and research organizations. Companies like Google, Microsoft, and IBM have embraced R for statistical computing and data science applications.
The language continues to evolve with strong support for modern data science workflows, making it an excellent choice for statistical analysis, research, and data-driven decision making.