# R
R is a programming language and environment specifically designed for statistical computing and graphics. It's widely used in academia, research, and industry for data analysis, statistical modeling, and data visualization.
## Core Features

### Statistical Computing
- **Built-in statistics**: Comprehensive statistical functions
- **Statistical models**: Linear, nonlinear, and mixed-effects models
- **Hypothesis testing**: Extensive testing procedures
- **Probability distributions**: All major distributions included (sampled in the sketch after this list)
- **Time series analysis**: ARIMA, VAR, and other methods
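Most of these ship with base R's stats package, which loads by default. A minimal sketch of the distribution and testing helpers (the sample values are illustrative):

```r
# Draw from and query the normal distribution (base stats functions)
samples <- rnorm(100, mean = 50, sd = 10)  # 100 random draws from N(50, 10)
pnorm(60, mean = 50, sd = 10)              # P(X <= 60)
qnorm(0.975)                               # 97.5th percentile of the standard normal

# One-sample t-test of whether the sample mean differs from 50
t.test(samples, mu = 50)
```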
### Data Structures

```r
# Vectors
numbers <- c(1, 2, 3, 4, 5)
names <- c("Alice", "Bob", "Charlie")

# Data frames
data <- data.frame(
  name = c("Alice", "Bob", "Charlie"),
  age = c(25, 30, 35),
  score = c(95, 87, 92)
)

# Lists
results <- list(
  model = lm(score ~ age, data = data),
  summary_stats = summary(data),
  plots = list()
)

# Matrices
matrix_data <- matrix(1:12, nrow = 3, ncol = 4)
```

## Data Manipulation
### Base R

```r
# Data filtering and selection
filtered_data <- data[data$age > 25, ]
high_scores <- subset(data, score > 90)

# Aggregation
aggregate_results <- aggregate(score ~ department, data = employee_data, mean)

# Apply functions
means_by_column <- apply(numeric_data, 2, mean)
row_sums <- apply(numeric_data, 1, sum)
```

### dplyr (Tidyverse)
```r
library(dplyr)
library(magrittr)

# Modern data manipulation
result <- employee_data %>%
  filter(department == "Engineering") %>%
  group_by(level) %>%
  summarise(
    avg_salary = mean(salary),
    count = n(),
    median_experience = median(years_experience)
  ) %>%
  arrange(desc(avg_salary))

# Data transformation
transformed_data <- raw_data %>%
  mutate(
    log_value = log(value),
    normalized_score = (score - mean(score)) / sd(score),
    category_factor = as.factor(category)
  ) %>%
  select(id, log_value, normalized_score, category_factor)
```

### tidyr for Data Reshaping
```r
library(tidyr)

# Pivot operations
wide_data <- long_data %>%
  pivot_wider(
    names_from = variable,
    values_from = value
  )

long_data <- wide_data %>%
  pivot_longer(
    cols = c(var1, var2, var3),
    names_to = "variable",
    values_to = "value"
  )

# Separate and unite columns
separated <- data %>%
  separate(full_name, into = c("first", "last"), sep = " ")

united <- data %>%
  unite("full_address", street, city, state, sep = ", ")
```

## Statistical Analysis
### Descriptive Statistics
```r
# Summary statistics
summary(data)
str(data)

# Custom summaries
descriptive_stats <- data %>%
  summarise(
    mean_value = mean(value, na.rm = TRUE),
    median_value = median(value, na.rm = TRUE),
    sd_value = sd(value, na.rm = TRUE),
    q25 = quantile(value, 0.25, na.rm = TRUE),
    q75 = quantile(value, 0.75, na.rm = TRUE),
    correlation = cor(value1, value2, use = "complete.obs")
  )
```

### Inferential Statistics
```r
# t-tests
t_test_result <- t.test(group_a, group_b)
print(t_test_result)

# ANOVA
anova_model <- aov(response ~ factor1 * factor2, data = experimental_data)
summary(anova_model)
TukeyHSD(anova_model)

# Chi-square test
chi_square <- chisq.test(contingency_table)
print(chi_square)

# Correlation tests
cor_test <- cor.test(x, y, method = "pearson")
print(cor_test)
```

### Regression Analysis
```r
# Linear regression
linear_model <- lm(dependent_var ~ independent_var1 + independent_var2,
                   data = dataset)
summary(linear_model)

# Model diagnostics
par(mfrow = c(2, 2))
plot(linear_model)

# Multiple regression with interaction
simple_model <- lm(y ~ x1 + x3, data = dataset)
complex_model <- lm(y ~ x1 * x2 + x3 + I(x1^2), data = dataset)
anova(simple_model, complex_model)  # F-test comparison of nested models

# Logistic regression
logistic_model <- glm(binary_outcome ~ predictor1 + predictor2,
                      family = binomial, data = dataset)
summary(logistic_model)
```

## Data Visualization
### Base R Graphics
```r
# Basic plots
plot(x, y, main = "Scatter Plot", xlab = "X Variable", ylab = "Y Variable")
lines(x, fitted_values, col = "red", lwd = 2)

# Histograms and density plots
# freq = FALSE puts the histogram on the density scale so the overlay lines up
hist(data$variable, breaks = 30, main = "Distribution", freq = FALSE)
lines(density(data$variable), col = "red", lwd = 2)

# Box plots
boxplot(value ~ category, data = dataset, main = "Values by Category")

# Multiple plots
par(mfrow = c(2, 2))
plot(model1)
par(mfrow = c(1, 1))  # Reset
```

### ggplot2
```r
library(ggplot2)

# Grammar of graphics approach
p1 <- ggplot(data, aes(x = variable1, y = variable2)) +
  geom_point(aes(color = category, size = weight)) +
  geom_smooth(method = "lm", se = TRUE) +
  scale_color_brewer(type = "qual", palette = "Set1") +
  labs(
    title = "Relationship between Variables",
    subtitle = "With linear trend line",
    x = "Independent Variable",
    y = "Dependent Variable",
    color = "Category"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )

# Faceted plots
p2 <- ggplot(time_series_data, aes(x = date, y = value)) +
  geom_line(aes(color = series)) +
  facet_wrap(~ region, scales = "free_y") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Save plots
ggsave("analysis_plot.png", p1, width = 10, height = 6, dpi = 300)
```

### Interactive Visualizations
```r
library(plotly)
library(DT)

# Interactive scatter plot
interactive_plot <- plot_ly(
  data = dataset,
  x = ~x_var,
  y = ~y_var,
  color = ~category,
  size = ~size_var,
  text = ~paste("ID:", id, "<br>Value:", value),
  hovertemplate = "%{text}<extra></extra>"
) %>%
  add_markers() %>%
  layout(
    title = "Interactive Scatter Plot",
    xaxis = list(title = "X Variable"),
    yaxis = list(title = "Y Variable")
  )

# Interactive data table
datatable(
  dataset,
  options = list(
    pageLength = 25,
    searchHighlight = TRUE,
    scrollX = TRUE
  ),
  filter = "top"
)
```

## Advanced Statistical Methods
### Time Series Analysis
```r
library(forecast)
library(tseries)

# Time series decomposition
ts_data <- ts(monthly_data, start = c(2020, 1), frequency = 12)
decomposition <- decompose(ts_data)
plot(decomposition)

# ARIMA modeling
auto_arima <- auto.arima(ts_data)
summary(auto_arima)

# Forecasting
forecast_result <- forecast(auto_arima, h = 12)
plot(forecast_result)

# Seasonal adjustment
seasonally_adjusted <- seasadj(decomposition)
```

### Machine Learning
```r
library(caret)
library(randomForest)
library(e1071)

# Data preparation
set.seed(123)
train_index <- createDataPartition(data$target, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

# Random forest
rf_model <- randomForest(
  target ~ .,
  data = train_data,
  ntree = 500,
  importance = TRUE
)

# Variable importance
importance(rf_model)
varImpPlot(rf_model)

# Predictions and evaluation
predictions <- predict(rf_model, test_data)
confusionMatrix(predictions, test_data$target)

# Cross-validation
cv_control <- trainControl(method = "cv", number = 10)
cv_model <- train(
  target ~ .,
  data = train_data,
  method = "rf",
  trControl = cv_control
)
```

## Package Management
### CRAN Packages
```r
# Install packages
install.packages(c("dplyr", "ggplot2", "tidyr"))

# Load packages
library(dplyr)
library(ggplot2)

# Check installed packages
installed.packages()

# Update packages
update.packages()
```

### Development Packages
```r
# Install from GitHub
devtools::install_github("username/package_name")

# Install from Bioconductor
BiocManager::install("package_name")

# Package documentation
help(package = "dplyr")
vignette("dplyr")
```

## R Markdown & Reporting
R Markdown documents combine YAML metadata, narrative text, and executable code chunks in a single file:

````markdown
---
title: "Data Analysis Report"
author: "Data Scientist"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_float: true
    theme: flatly
---

# Analysis Overview

This report presents findings from our data analysis.

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(dplyr)
library(ggplot2)
```

```{r}
# Load and analyze data
data <- read.csv("data.csv")
summary_stats <- data %>%
  group_by(category) %>%
  summarise(
    mean_value = mean(value),
    count = n()
  )
knitr::kable(summary_stats, caption = "Summary Statistics by Category")
```

```{r}
# Create visualization
ggplot(data, aes(x = category, y = value)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Value Distribution by Category")
```

# Conclusions

Based on the analysis, we found...
````
## Popular Packages
### Data Manipulation
- **dplyr**: Grammar of data manipulation
- **tidyr**: Tidy messy data
- **data.table**: Fast data manipulation (sketched after this list)
- **stringr**: String operations
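As a point of contrast with the dplyr pipeline shown earlier, a hedged sketch of data.table's `dt[i, j, by]` bracket syntax, reusing the illustrative employee_data columns from above:

```r
library(data.table)

# Convert a data frame to a data.table
dt <- as.data.table(employee_data)

# Filter, aggregate, and group in one bracket call; .N is the group row count
dt[department == "Engineering",
   .(avg_salary = mean(salary), count = .N),
   by = level]
```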
### Visualization
- **ggplot2**: Grammar of graphics
- **plotly**: Interactive visualizations
- **lattice**: Trellis graphics
- **leaflet**: Interactive maps (sketched below)
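As one example, leaflet composes maps with the same pipe style as the plotly snippet above; a minimal sketch with illustrative coordinates:

```r
library(leaflet)

# A tiled basemap with a single popup marker
leaflet() %>%
  addTiles() %>%  # default OpenStreetMap tiles
  addMarkers(lng = -122.42, lat = 37.77, popup = "San Francisco")
```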
### Statistical Analysis
- **caret**: Classification and regression training
- **forecast**: Time series forecasting
- **survival**: Survival analysis
- **lme4**: Mixed-effects models (example below)
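Mixed-effects models, mentioned under Core Features but not demonstrated above, reuse the familiar formula interface with an added random-effects term; a sketch assuming an illustrative repeated-measures dataset:

```r
library(lme4)

# Fixed effect for treatment, random intercept per subject
mixed_model <- lmer(score ~ treatment + (1 | subject), data = study_data)
summary(mixed_model)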
### Specialized Domains
- **Bioconductor**: Bioinformatics packages
- **quantmod**: Quantitative financial modeling
- **sp/sf**: Spatial data analysis (sf example after this list)
- **tm**: Text mining
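For spatial work, a brief sf sketch reading the North Carolina shapefile that ships with the package:

```r
library(sf)

# Read a bundled shapefile and plot its geometry
nc <- st_read(system.file("shape/nc.shp", package = "sf"))
plot(st_geometry(nc))
```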
## Best Practices
### Code Organization
1. **Project structure**: Use RStudio projects
2. **Reproducibility**: Set random seeds and document versions (see the sketch after this list)
3. **Documentation**: Comment code and create README files
4. **Version control**: Use Git for tracking changes
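A minimal reproducibility preamble for point 2 (the seed value is arbitrary):

```r
# Fix the random number stream so sampling-based results repeat exactly
set.seed(42)

# Record the R version and loaded package versions alongside the analysis
sessionInfo()
```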
### Data Analysis
1. **Exploratory analysis**: Always explore data first
2. **Data validation**: Check for outliers and missing values (example after this list)
3. **Assumptions**: Verify statistical assumptions
4. **Visualization**: Create informative plots
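A quick validation pass for point 2 (the `value` column is illustrative):

```r
# Missing values per column
colSums(is.na(data))

# Flag values more than 3 SDs from the mean as candidate outliers
z <- (data$value - mean(data$value, na.rm = TRUE)) / sd(data$value, na.rm = TRUE)
data[abs(z) > 3 & !is.na(z), ]
```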
### Programming Style
```r
# Good practices: document purpose, arguments, and return value
library(dplyr)
library(stringr)

#' Clean and prepare raw data for analysis
#'
#' @param raw_data Raw dataset from source
#' @return Cleaned dataset ready for analysis
clean_data <- function(raw_data) {
  cleaned <- raw_data %>%
    filter(!is.na(key_variable)) %>%
    mutate(
      standardized_name = str_to_lower(str_trim(name)),
      date_formatted = as.Date(date_string, format = "%Y-%m-%d")
    ) %>%
    select(-temporary_columns)
  return(cleaned)
}
```

## When to Choose R
### Ideal For
- Statistical analysis and modeling
- Data exploration and visualization
- Academic research and publications
- Bioinformatics and life sciences
- Financial analysis and risk modeling
- Survey data analysis
- Reproducible research reports
### Consider Alternatives When

- Building production web applications
- Developing real-time systems
- Writing mobile apps
- Running large-scale distributed computing jobs
- Handling memory-intensive workloads (R holds working data in RAM)
## Learning Resources

### Books
- **R for Data Science**: Tidyverse approach
- **The Art of R Programming**: Programming fundamentals
- **Advanced R**: Deep dive into R internals
- **An Introduction to Statistical Learning**: Machine learning with applications in R
### Online Resources
- **CRAN documentation**: Comprehensive package docs
- **RStudio resources**: Tutorials and guides
- **R-bloggers**: Community blog aggregator
- **Stack Overflow**: Q&A community
## Industry Applications
R is widely used in academia, pharmaceuticals, finance, and research organizations. Companies like Google, Microsoft, and IBM have embraced R for statistical computing and data science applications.
The language continues to evolve with strong support for modern data science workflows, making it an excellent choice for statistical analysis, research, and data-driven decision making.