##======================================================##
## ##
## NetSciX School of Code Workshop ##
## Network analysis with R and igraph ##
## Wroclaw, Poland, January 10 2016 ##
## ##
## Katya Ognyanova, katya@ognyanova.net ##
## www.kateto.net/netscix2016 ##
## ##
##======================================================##
# Handouts and example data: bitly.com/netscix2016
# Online version of the tutorial: kateto.net/netscix2016
# CONTENTS
# 1. A quick R introduction/refresher
# 2. Networks in igraph
# 3. Reading network data from files
# 4. Turning networks into igraph objects
# 5. Plotting networks with igraph
# 6. Network and node descriptives
# 7. Distances and paths
# 8. Subgroups and communities
# 9. Assortativity and Homophily
# Install the package "igraph" if you don't have it yet
# (this workshop was written against version 1.0.1).
# The package (www.igraph.org) is maintained by Gabor Csardi and Tamas Nepusz.
# The installation is guarded so that re-running the script does not download
# and reinstall the package every time.
if (!requireNamespace("igraph", quietly = TRUE)) {
  install.packages("igraph")
}
# ================ 1. A quick R introduction/reminder ================
# You can assign a value to an object using assign(), "<-", or "=".
x <- 3          # Assignment
x               # Evaluate the expression and print result
y <- 4          # Assignment
y + 5           # Evaluation, y remains 4
z <- x + 17*y   # Assignment
z               # Evaluation
rm(z)           # Remove z: deletes the object.
# Evaluating z now raises "object 'z' not found". The call is wrapped in try()
# so the error is still reported, but it does not abort the rest of the script
# when the file is sourced as a whole.
try(z)          # Error!
# ------->> Value comparisons: --------
# Comparisons return boolean values: TRUE or FALSE (often abbreviated to T and F)
2 == 2          # Equality
2 != 2          # Inequality
x <= y          # less than or equal: "<", ">", and ">=" also work
# ------->> Special constants --------
# R has a handful of special values: NA, NULL, Inf, -Inf and NaN.

# NA stands for missing or undefined data.
5 + NA            # NA propagates: an expression that uses it is generally NA
is.na(5 + NA)     # Test whether a value is missing

# NULL is an empty object, e.g. a null/empty list.
10 + NULL         # Arithmetic with NULL yields an empty (length zero) object
is.null(NULL)     # Test whether an object is NULL

# Inf and -Inf stand for positive and negative infinity.
# They can come out of mathematical operations such as dividing a number by zero:
5 / 0
is.finite(5 / 0)  # Test whether a number is finite

# NaN (Not a Number) is the result of an operation that cannot be reasonably defined.
0 / 0
is.nan(0 / 0)
# ------->> Vectors --------
v1 <- c(1, 5, 11, 33)        # Numeric vector, length 4
v2 <- c("hello", "world")    # Character vector, length 2 (a vector of strings)
v3 <- c(TRUE, TRUE, FALSE)   # Logical vector; T and F are mere aliases that can be
                             # reassigned, so TRUE and FALSE are the safer spelling
# Combining different types of elements in one vector will coerce the elements
# to the least restrictive type:
v4 <- c(v1, v2, v3, "boo")   # All elements turn into strings
# Other ways to create vectors:
v <- 1:7                     # same as c(1,2,3,4,5,6,7)
v <- rep(0, 77)              # repeat zero 77 times: v is a vector of 77 zeroes
v <- rep(1:3, times=2)       # Repeat 1,2,3 twice
v <- rep(1:10, each=2)       # Repeat each element twice
v <- seq(10, 20, 2)          # sequence: numbers between 10 and 20, in jumps of 2
length(v)                    # check the length of the vector
v1 <- 1:5                    # 1,2,3,4,5
v2 <- rep(1, 5)              # 1,1,1,1,1
# Element-wise operations:
v1 + v2       # Element-wise addition
v1 + 1        # Add 1 to each element
v1 * 2        # Multiply each element by 2
v1 + c(1,7)   # Careful: this runs, but c(1,7) is recycled to length 5 and R only
              # warns that the longer length is not a multiple of the shorter one
# Mathematical operations:
sum(v1)         # The sum of all elements
mean(v1)        # The average of all elements
sd(v1)          # The standard deviation
cor(v1, v1*5)   # Correlation between v1 and v1*5
# Logical operations:
v1 > 2            # Each element is compared to 2, returns logical vector
v1 == v2          # Are corresponding elements equivalent, returns logical vector.
v1 != v2          # Are corresponding elements *not* equivalent? Same as !(v1==v2)
(v1>2) | (v2>0)   # | is the boolean OR, returns a vector.
(v1>2) & (v2>0)   # & is the boolean AND, returns a vector.
# || and && are the scalar versions: they return a single value and expect
# operands of length one. Older R versions silently used only the first element
# of longer vectors; R >= 4.3 raises an error instead. We therefore pick the
# first elements explicitly, which gives the same result on every R version.
(v1>2)[1] || (v2>0)[1]   # || is the boolean OR, returns a single value
(v1>2)[1] && (v2>0)[1]   # && is the boolean AND, ditto
# Vector elements
v1[3]                                  # third element of v1
v1[2:4]                                # elements 2, 3, 4 of v1
v1[c(1,3)]                             # elements 1 and 3 - note that your indexes are a vector
v1[c(TRUE, TRUE, FALSE, FALSE, FALSE)] # elements 1 and 2 - only the ones that are TRUE
v1[v1>3]                               # v1>3 is a logical vector TRUE for elements >3
# NOTE: If you are used to languages indexing from 0, R will surprise you by indexing from 1.
# To add more elements to a vector, simply assign them values.
v1[6:10] <- 6:10
# We can also directly assign the vector a length:
length(v1) <- 15 # the last 5 elements are added as missing data: NA
# ------->> Factors --------
# A factor is R's container for categorical data.
eye.col.v <- c("brown", "green", "brown", "blue", "blue", "blue")  # plain character vector
eye.col.f <- factor(eye.col.v)                                     # same values as a factor
eye.col.v
eye.col.f

# R works out the levels of a factor, i.e. its distinct values, by itself.
# Internally the data is kept as integers, one integer code per level.
levels(eye.col.f)         # Distinct values of the categorical variable
as.numeric(eye.col.f)     # Integer codes: 1 is blue, 2 is brown, 3 is green
as.numeric(eye.col.v)     # A character vector cannot be coerced to numeric (NAs plus a warning)
as.character(eye.col.f)
as.character(eye.col.v)
# ------->> Matrices & Arrays --------

# A matrix is nothing more than a vector that carries dimensions:
m <- rep(1, 20)       # Start from a vector of twenty 1s
dim(m) <- c(5, 4)     # Give it 5 rows and 4 columns: m is now a 5x4 matrix

# The matrix() function builds one directly:
m <- matrix(data = 1, nrow = 5, ncol = 4)   # the same 5x4 matrix of 1s
m <- matrix(1, 5, 4)                        # ditto, with positional arguments
dim(m)                                      # Ask for the dimensions of m

# Vectors can be glued together into a matrix:
m <- cbind(1:5, 5:1, 5:9)   # three vectors side by side as columns -> 5x3
m <- rbind(1:5, 5:1, 5:9)   # three vectors stacked as rows -> 3x5
m <- matrix(1:10, 10, 10)

# Picking out parts of a matrix:
m[2, 3]           # A single cell: row 2, column 3
m[2, ]            # Row 2 in full, returned as a vector
m[, 2]            # Column 2 in full, returned as a vector
m[1:2, 4:6]       # A submatrix: rows 1-2, columns 4-6
m[-1, ]           # Every row *but* the first
m[1, ] == m[, 1]  # Compare row 1 with column 1, element by element
m > 3             # Logical matrix: TRUE where the element exceeds 3
m[m > 3]          # Keep just the elements for which the test is TRUE

t(m)              # The transpose of m
m <- t(m)         # Overwrite m with its transpose
m %*% t(m)        # Matrix multiplication uses %*%
m * m             # A plain * multiplies element by element

# Arrays generalise matrices to more than 2 dimensions.
# They are built with array():
a <- array(data = 1:18, dim = c(3, 3, 2))   # a 3x3x2 array
a <- array(1:18, c(3, 3, 2))                # the very same array
# ------->> Lists --------
# A list is a collection of arbitrary objects (strings, vectors, matrices, other lists, ...)
l1 <- list(boo = v1, foo = v2, moo = v3, zoo = "Animals!")   # Four named components
l2 <- list(v1, v2, v3, "Animals!")                           # The same, without names
l3 <- list()
l4 <- NULL

l1["boo"]     # Single brackets keep the container: the result is a list.
l1[["boo"]]   # Double brackets extract the component: the result is the vector itself
l1[[1]]       # The first component by position - same thing as the line above.
l1$boo        # The $ operator reaches named components, just like [[ ]]

# Growing a list:
l3[[1]] <- 11           # Put a first element into the empty list l3
l4[[3]] <- c(22, 23)    # Put a vector at position 3 of the empty l4.
# Because position 3 was filled, positions 1 & 2 are created too and left empty (NULL)
l1[[5]] <- "More elements!"   # l1 had 4 elements; this is its 5th.
l1[[8]] <- 1:11               # An 8th element with no 6th or 7th: those are created empty (NULL)
l1$Something <- "A thing"     # A ninth element - "A thing", named "Something"
# ------->> Data Frames --------
# The data frame is a special kind of list used for storing dataset tables.
# Think of rows as cases, columns as variables. Each column is a vector or factor.
# Creating a dataframe:
# Before R 4.0.0, data.frame() turned character columns into factors by default;
# from 4.0.0 on the default is stringsAsFactors=FALSE. We request the factor
# conversion explicitly here so the example below behaves the same on any version.
dfr1 <- data.frame( ID=1:4,
                    FirstName=c("John","Jim","Jane","Jill"),
                    Female=c(FALSE,FALSE,TRUE,TRUE),
                    Age=c(22,33,44,55),
                    stringsAsFactors=TRUE )
dfr1$FirstName   # Access the second column of dfr1.
# Notice that R was told to treat this as a categorical variable
# and so it's stored as a factor, not a character vector.
# Let's get rid of the factor by converting FirstName back to a character vector:
dfr1$FirstName <- as.vector(dfr1$FirstName)
# Alternatively, you can tell R you don't like factors from the start using stringsAsFactors=FALSE
dfr2 <- data.frame(FirstName=c("John","Jim","Jane","Jill"), stringsAsFactors=FALSE)
dfr2$FirstName   # Success: not a factor.
# Access elements of the data frame
dfr1[1,]         # First row, all columns
dfr1[,1]         # First column, all rows
dfr1$Age         # Age column, all rows
dfr1[1:2,3:4]    # Rows 1 and 2, columns 3 and 4 - the gender and age of John & Jim
dfr1[c(1,3),]    # Rows 1 and 3, all columns
# Find the names of everyone over the age of 30 in the data
dfr1[dfr1$Age>30,2]
# Find the average age of all females in the data:
mean( dfr1[dfr1$Female==TRUE,4] )
# ------->> Flow Control --------

# Conditional: if (condition) expr1 else expr2
x <- 5
y <- 10
if (x == 0) {
  y <- 0
} else {
  y <- y / x
}
y

# Counted loop: for (variable in sequence) expr
ASum <- 0
AProd <- 1
for (i in seq_len(x)) {
  ASum <- ASum + i
  AProd <- AProd * i
}
ASum    # same value as sum(1:x)
AProd   # same value as prod(1:x)

# Conditional loop: while (condition) expr
while (x > 0) {
  print(x)
  x <- x - 1
}

# Endless loop: repeat expr, leave it with break
repeat {
  print(x)
  x <- x + 1
  if (x > 10) {
    break
  }
}
# ------->> R plots and colors --------
# Most R functions accept colors as names, hex strings, or rgb values.
# In the simple base-graphics chart below, x and y are the point coordinates,
# pch is the point symbol shape, cex is the point size, and col is the color.
# See ?par for the full list of base R plotting parameters.
plot(x = 1:10, y = rep(5, 10), pch = 19, cex = 5, col = "dark red")
points(x = 1:10, y = rep(6, 10), pch = 19, cex = 5, col = "#557799")
points(x = 1:10, y = rep(4, 10), pch = 19, cex = 5, col = rgb(.25, .5, .3))

# The rgb() components above run from 0 to 1, which is the R default.
# A 0-255 scale can be requested instead:
rgb(10, 100, 100, maxColorValue = 255)

# The 'alpha' parameter (range 0-1) controls opacity/transparency:
plot(x = 1:5, y = rep(5, 5), pch = 19, cex = 16,
     col = rgb(.25, .5, .3, alpha = .5), xlim = c(0, 6))
# If we have a hex color representation, we can set the transparency alpha
# using 'adjustcolor' from package 'grDevices' (its argument is called alpha.f).
# For fun, let's also set the plot background to gray using the par() function
# for graphical parameters. par() returns the previous settings, so we keep them
# and restore them afterwards instead of assuming the background was white.
old.par <- par(bg="gray40")
col.tr <- grDevices::adjustcolor("#557799", alpha.f=0.7)
plot(x=1:5, y=rep(5,5), pch=19, cex=20, col=col.tr, xlim=c(0,6))
par(old.par)
# If you plan on using the built-in color names, here's what they are:
colors()
grep("blue", colors(), value=TRUE)   # only the color names that contain "blue"
# In many cases, we need a number of contrasting colors, or multiple shades of a color.
# R comes with some predefined palette functions that can generate those for us.
pal1 <- heat.colors(5, alpha=1)   # generate 5 colors from the heat palette, opaque
pal2 <- rainbow(5, alpha=.5)      # generate 5 colors from the rainbow palette, semi-transparent
plot(x=1:10, y=1:10, pch=19, cex=10, col=pal1)
plot(x=10:1, y=1:10, pch=19, cex=10, col=pal2)
# Custom gradients are available through colorRampPalette.
# Keep in mind that colorRampPalette hands back a *function*; calling that
# function with a number yields that many colors from the gradient.
palf <- colorRampPalette(c("gray70", "red"))
plot(x = 10:1, y = 1:10, pch = 19, cex = 10, col = palf(100))

# For a gradient with transparency, colorRampPalette needs the extra parameter `alpha=TRUE`:
palf <- colorRampPalette(c(rgb(1, 1, 1, .2), rgb(.8, 0, 0, .7)), alpha = TRUE)
plot(x = 10:1, y = 1:10, pch = 19, cex = 10, col = palf(10))