R - Getting Started

4 minute read

Published:

Download:

Resources

Basic Commands

  • Functions are used to perform operations

  • Concatenate to create a Vector

    •   x = c(1,2,3,4,5) # 1 2 3 4 5
        x <- c(1,2,3,4,5) # 1 2 3 4 5
      
  • Help

    •   ?c
      
  • Adding two vectors

    •   x = c(1,2,3,4,5)
        y = c(1,2,3,4,5)
        z = x+y # 2  4  6  8 10
        z = z+2 # 4  6  8 10 12
              
        k = c(2,3)
        z = x+k # 3 5 5 7 7, with a warning: k is recycled, but 5 is not a multiple of 2
              
        k = c(2)
        z = x+k # 3 4 5 6 7
      
  • Length of vector

    •   length(x) # 5
        length(y) # 5
        length(k) # 1 (k was last set to c(2))
      
  • List of All Objects and Remove Object(s)

    •   ls() # x, y, k, z
        rm(k, z)
              
        rm(list=ls())
      
  • Creating Function

    •   f = function(x, y) x^2 + y^2
        f(10, 10) # 200
              
        f = function(x, y){ # Shift+Enter
             z = x^2 + y^2
             z
         }
        f(10, 10) # 200
      
  • Matrix

    •   ?matrix
        x = matrix(data=c(1, 2, 3, 4), nrow=2, ncol=2)
              
        x = matrix(c(1, 2, 3, 4), 2, 2)
              
        x = matrix(nrow=2, ncol=2, data=c(1, 2, 3, 4))
              
        dim(x)
      
  • Outer

    •   x = 1:10
        y = x
        z = x %o% y # default multiplication operator
        z = outer(x,y)
        z = outer(x, y, "+")
              
        f = function(x, y) x^2 + y^2
        z = outer(x, y, f)
      
  • Other Functions

    •   sqrt(x)
        x^2
              
        # rnorm() generates vector of normal variables with mean 0 and std 1
        x = rnorm(50)
        y = rnorm(50, mean=50, sd=0.1)
              
        # Correlation
        cor(x, y)
              
        # Seed
        set.seed(5); x = rnorm(5); x
        set.seed(5); x = rnorm(5); x
              
        # Mean, Variance, Standard Deviation
        mean(x); var(x); sd(x)
      

Graphics

  •   x = rnorm(100)
      y = rnorm(100)
        
      plot(x,y)
      plot(x, y, xlim=c(0,2), ylim=c(0,2))
        
      plot(x, y, type="p") # "l" for lines and "b" for both points and lines
        
      plot(x,y, main="Scatter Plot", xlab="x-axis", ylab="y-axis") # xlabel and ylabel
        
      # Saving plot in pdf
      pdf(file = "/Users/naneja/Downloads/xy.pdf")
      plot(x,y, main="xy plot", xlab="x-axis", ylab="y-axis")
      dev.off() # complete plotting
        
      # Saving plot in jpg
      jpeg(file = "/Users/naneja/Downloads/xy.jpg")
      plot(x,y, main="xy plot", xlab="x-axis", ylab="y-axis")
      dev.off() # complete plotting
    
  • Sequence of Numbers

    •   # Sequence for vector of integers
        x = seq(1, 10); x
        x = 1:10; x
              
        # Sequence of equally spaced numbers (not necessarily integers)
        x = seq(1, 10, length=5); x
        x <- seq(-pi, pi, length = 5); x
      
  • 3D Plots

    • First dimension: a vector of the x values
    • Second dimension: a vector of the y values
    • Third dimension: a matrix of the z values whose elements correspond to each pair of (x, y) coordinates

    •   x = 1:10
        y = x
              
        f = function(x, y) cos(y) / (1 + x^2)
        z = outer(x, y, f)
              
        # 3D Plot
        persp(x, y, z)
        persp(x, y, z , theta=30)
        persp(x, y, z , theta=30, phi=70)
        persp(x, y, z , theta=30, phi=40)
              
        # Heatmap based on values of z
        image(x, y, z)
      

Indexing Data

  •   A = matrix(1:16, 4, 4)
        
      A[4,4] # 16
      A[c(1,2,3), c(1,2,3)] # 3x3
      A[1:3, 1:3]
        
      A[4,]
      A[,4]
        
      A[1:3,]
      A[,1:3]
        
      A[-1,]
      A[-c(1,2),]
      A[-c(1,4),]
    

Loading Data

# https://www.kaggle.com/c/titanic/data
getwd()
setwd("/Users/naneja/Downloads/R")

data = read.csv("titanic/train.csv")
data[0,] # column names
install.packages("ISLR2")
library(ISLR2)

Auto = ISLR2::Auto
Auto = read.csv("data/auto.csv")

View(Auto)
head(Auto)

dim(Auto)

Auto = read.csv("data/auto.csv", header = T, na.strings = c("?"), stringsAsFactors = T)

Auto[1:4, ]

Auto <- na.omit(Auto)
dim(Auto)

names(Auto)
  • na.strings
    • character strings in the file that are read as NA (missing values), e.g. "?"
  • stringsAsFactors = T
    • tells R that any variable containing character strings should be interpreted as a qualitative variable (a factor), and that each distinct character string represents a distinct level of that variable.

Additional Graphical and Numerical Summaries

  • Scatterplot of quantitative variables

    •   library(ISLR2)
        Auto = ISLR2::Auto
              
        names(Auto)
              
        plot(Auto$cylinders , Auto$mpg)
              
        attach(Auto)
        plot(cylinders , mpg)
        detach()
      
  • Converting Quantitative to Qualitative

    • The cylinders variable takes only a small number of distinct values, so it may be converted to a qualitative variable

    •   attach(Auto) # re-attach: the calls below use bare column names
        cylinders <- as.factor(cylinders)
              
        plot(cylinders , mpg) # Box Plot
              
        plot(cylinders , mpg , col="red")
              
        plot(cylinders , mpg , col="red", varwidth=T)
              
        plot(cylinders , mpg , col="red", varwidth=T, horizontal=T)
              
        plot(cylinders , mpg , col="red", varwidth=T, xlab="cylinders", ylab="MPG")
      
  • Histogram

    • Numeric Variable

    •   hist(mpg)
              
        hist(mpg , col=2) # red
              
        hist(mpg , col=2, breaks=15) # breaks is a suggestion only
      
  • Pairs - Scatterplot Matrix

    •   pairs(Auto) # for every pair of variables
              
        pairs(~ mpg + displacement + horsepower + weight + acceleration, data = Auto) # subset of variables 
      
  • Identify

    •   plot(horsepower , mpg)
              
        # Label some points with a variable
        names(Auto) # Field names
              
        identify(horsepower , mpg , name) # Click points to label them with name; press Esc to finish
      
  • Summary

    •   summary(Auto) # Numeric summary of each variable
              
        summary(mpg) # Summary of single variable