##======================================================##
##                                                      ##
##  NetSciX School of Code Workshop                     ##
##  Network analysis with R and igraph                  ##
##  Wroclaw, Poland, January 10 2016                    ##
##                                                      ##
##  Katya Ognyanova, katya@ognyanova.net                ##
##  www.kateto.net/netscix2016                          ##
##                                                      ##
##======================================================##

# Handouts and example data: bitly.com/netscix2016
# Online version of the tutorial: kateto.net/netscix2016

# CONTENTS
#
#   1. A quick R introduction/refresher
#   2. Networks in igraph
#   3. Reading network data from files
#   4. Turning networks into igraph objects
#   5. Plotting networks with igraph
#   6. Network and node descriptives
#   7. Distances and paths
#   8. Subgroups and communities
#   9. Assortativity and Homophily

# Install the package "igraph" if you don't have its latest version (1.0.1)
# The package (www.igraph.org) is maintained by Gabor Csardi and Tamas Nepusz.
# The check below skips the download when a recent enough igraph is present,
# so re-running the script does not reinstall the package every time.
if (!requireNamespace("igraph", quietly = TRUE) ||
    utils::packageVersion("igraph") < "1.0.1") {
  install.packages("igraph")
}


# ================ 1. A quick R introduction/reminder ================

# You can assign a value to an object using assign(), "<-", or "=".

x <- 3            # Assignment
x                 # Evaluate the expression and print result

y <- 4            # Assignment
y + 5             # Evaluation, y remains 4

z <- x + 17 * y   # Assignment
z                 # Evaluation

rm(z)             # Remove z: deletes the object.
try(z)            # Error! z no longer exists. try() reports the error
                  # but lets the rest of the script keep running.


# ------->> Value comparisons: --------

# Comparisons return boolean values: TRUE or FALSE (often abbreviated to T and F)

2 == 2   # Equality
2 != 2   # Inequality
x <= y   # less than or equal: "<", ">", and ">=" also work


# ------->> Special constants --------

# NA, NULL, Inf, -Inf, NaN

# NA - missing or undefined data
5 + NA          # When used in an expression, the result is generally NA
is.na(5 + NA)   # Check if missing

# NULL - an empty object, e.g. a null/empty list
10 + NULL       # Using NULL in an expression returns an empty object (length zero)
is.null(NULL)   # check if NULL

# Inf and -Inf represent positive and negative infinity
# They can be returned by mathematical operations like division of a number by zero:
5 / 0
is.finite(5 / 0)   # Check if a number is finite

# NaN (Not a Number) - the result of an operation that cannot be reasonably defined
0 / 0
is.nan(0 / 0)


# ------->> Vectors --------

v1 <- c(1, 5, 11, 33)         # Numeric vector, length 4
v2 <- c("hello", "world")     # Character vector, length 2 (a vector of strings)
v3 <- c(TRUE, TRUE, FALSE)    # Logical vector, same as c(T, T, F)

# Combining different types of elements in one vector will coerce the elements
# to the least restrictive type:
v4 <- c(v1, v2, v3, "boo")    # All elements turn into strings

# Other ways to create vectors:
v <- 1:7                  # same as c(1,2,3,4,5,6,7)
v <- rep(0, 77)           # repeat zero 77 times: v is a vector of 77 zeroes
v <- rep(1:3, times = 2)  # Repeat 1,2,3 twice
v <- rep(1:10, each = 2)  # Repeat each element twice
v <- seq(10, 20, 2)       # sequence: numbers between 10 and 20, in jumps of 2

length(v)                 # check the length of the vector

v1 <- 1:5          # 1,2,3,4,5
v2 <- rep(1, 5)    # 1,1,1,1,1

# Element-wise operations:
v1 + v2        # Element-wise addition
v1 + 1         # Add 1 to each element
v1 * 2         # Multiply each element by 2
v1 + c(1, 7)   # Lengths differ (5 vs 2): c(1,7) is recycled to 1,7,1,7,1 and R
               # warns because 5 is not a multiple of 2. Usually a mistake.

# Mathematical operations:
sum(v1)            # The sum of all elements
mean(v1)           # The average of all elements
sd(v1)             # The standard deviation
cor(v1, v1 * 5)    # Correlation between v1 and v1*5

# Logical operations:
v1 > 2     # Each element is compared to 2, returns logical vector
v1 == v2   # Are corresponding elements equivalent, returns logical vector.
v1 != v2   # Are corresponding elements *not* equivalent? Same as !(v1==v2)
(v1 > 2) | (v2 > 0)   # | is the boolean OR, returns a vector.
(v1 > 2) & (v2 > 0)   # & is the boolean AND, returns a vector.
# || and && are the scalar forms of OR / AND: they expect length-one operands
# and return a single value. Since R 4.3.0 they raise an error when given
# longer vectors (older versions silently used only the first element), so
# pick one element - or summarise with any() / all() - before using them.
(v1[1] > 2) || (v2[1] > 0)   # || is the boolean OR, returns a single value
(v1[1] > 2) && (v2[1] > 0)   # && is the boolean AND, ditto

# Vector elements
v1[3]                                   # third element of v1
v1[2:4]                                 # elements 2, 3, 4 of v1
v1[c(1, 3)]                             # elements 1 and 3 - note that your indexes are a vector
v1[c(TRUE, TRUE, FALSE, FALSE, FALSE)]  # elements 1 and 2 - only the ones that are TRUE
v1[v1 > 3]                              # v1>3 is a logical vector TRUE for elements >3

# NOTE: If you are used to languages indexing from 0, R will surprise you by indexing from 1.

# To add more elements to a vector, simply assign them values.
v1[6:10] <- 6:10

# We can also directly assign the vector a length:
length(v1) <- 15   # the last 5 elements are added as missing data: NA


# ------->> Factors --------

# Factors are used to store categorical data.

eye.col.v <- c("brown", "green", "brown", "blue", "blue", "blue")          # vector
eye.col.f <- factor(c("brown", "green", "brown", "blue", "blue", "blue"))  # factor
eye.col.v
eye.col.f

# R will identify the different levels of the factor - e.g. all distinct values.
# The data is stored internally as integers - each number corresponding to a factor level.

levels(eye.col.f)       # The levels (distinct values) of the factor (categorical variable)

as.numeric(eye.col.f)   # The factor as numeric values: 1 is blue, 2 is brown, 3 is green
as.numeric(eye.col.v)   # The character vector, however, can not be coerced to numeric
                        # (the result is all NA, with a warning)

as.character(eye.col.f)
as.character(eye.col.v)


# ------->> Matrices & Arrays --------

# A matrix is a vector with dimensions:
m <- rep(1, 20)     # A vector of 20 elements, all 1
dim(m) <- c(5, 4)   # Dimensions set to 5 & 4, so m is now a 5x4 matrix

# Create a matrix using matrix():
m <- matrix(data = 1, nrow = 5, ncol = 4)   # same matrix as above, 5x4, full of 1s
m <- matrix(1, 5, 4)                        # same matrix as above
dim(m)                                      # What are the dimensions of m?
# Create a matrix by combining vectors:
m <- cbind(1:5, 5:1, 5:9)   # Bind 3 vectors as columns, 5x3 matrix
m <- rbind(1:5, 5:1, 5:9)   # Bind 3 vectors as rows, 3x5 matrix

m <- matrix(1:10, 10, 10)   # 10x10 matrix; 1:10 is recycled to fill every column

# Select matrix elements:
m[2, 3]       # Matrix m, row 2, column 3 - a single cell
m[2, ]        # The whole second row of m as a vector
m[, 2]        # The whole second column of m as a vector
m[1:2, 4:6]   # submatrix: rows 1 and 2, columns 4, 5 and 6
m[-1, ]       # all rows *except* the first one

m[1, ] == m[, 1]   # Are elements in row 1 equivalent to corresponding elements from column 1?
m > 3              # A logical matrix: TRUE for m elements >3, FALSE otherwise
m[m > 3]           # Selects only TRUE elements - that is ones greater than 3

t(m)           # Transpose m
m <- t(m)      # Assign m the transposed m
m %*% t(m)     # %*% does matrix multiplication
m * m          # * does element-wise multiplication

# Arrays: more than 2 dimensions
# Created with the array() function:
a <- array(data = 1:18, dim = c(3, 3, 2))   # 3d with dimensions 3x3x2
a <- array(1:18, c(3, 3, 2))                # the same array


# ------->> Lists --------

# Lists are collections of objects (e.g. of strings, vectors, matrices, other lists, etc.)

l1 <- list(boo = v1, foo = v2, moo = v3, zoo = "Animals!")   # A list with four components
l2 <- list(v1, v2, v3, "Animals!")

l3 <- list()
l4 <- NULL

l1["boo"]     # Access boo: this returns a list.
l1[["boo"]]   # Access boo: this returns the numeric vector
l1[[1]]       # Returns the first component of the list, equivalent to above.
l1$boo        # Named elements can be accessed using the $ operator - equivalent to [[]]

# Add more elements to a list:
l3[[1]] <- 11            # add an element to the empty list l3
l4[[3]] <- c(22, 23)     # add a vector as element 3 in the empty list l4.
                         # Since we added element 3, elements 1 & 2 will be generated and empty (NULL)
l1[[5]] <- "More elements!"   # The list l1 had 4 elements, we're adding a 5th here.
l1[[8]] <- 1:11               # We added an 8th element, but not 6th or 7th.
                              # Those will be created empty (NULL)
l1$Something <- "A thing"     # Adds a ninth element - "A thing", named "Something"


# ------->> Data Frames --------

# The data frame is a special kind of list used for storing dataset tables.
# Think of rows as cases, columns as variables. Each column is a vector or factor.

# Creating a dataframe:

dfr1 <- data.frame(
  ID = 1:4,
  FirstName = c("John", "Jim", "Jane", "Jill"),
  Female = c(FALSE, FALSE, TRUE, TRUE),
  Age = c(22, 33, 44, 55)
)

dfr1$FirstName   # Access the second column of dfr1.
# In R versions before 4.0.0, data.frame() turned strings into factors by
# default, so FirstName would be treated as a categorical variable here.
# Since R 4.0.0 the default is stringsAsFactors=FALSE and it stays a
# character vector.

# On older versions, get rid of the factor by telling R to treat FirstName as
# a vector (harmless when it already is a character vector):
dfr1$FirstName <- as.vector(dfr1$FirstName)

# Alternatively, you can tell R you don't like factors from the start using stringsAsFactors=FALSE
dfr2 <- data.frame(FirstName = c("John", "Jim", "Jane", "Jill"), stringsAsFactors = FALSE)
dfr2$FirstName   # Success: not a factor.

# Access elements of the data frame
dfr1[1, ]         # First row, all columns
dfr1[, 1]         # First column, all rows
dfr1$Age          # Age column, all rows
dfr1[1:2, 3:4]    # Rows 1 and 2, columns 3 and 4 - the gender and age of John & Jim
dfr1[c(1, 3), ]   # Rows 1 and 3, all columns

# Find the names of everyone over the age of 30 in the data
dfr1[dfr1$Age > 30, 2]

# Find the average age of all females in the data:
mean(dfr1[dfr1$Female == TRUE, 4])


# ------->> Flow Control --------

# if (condition) expr1 else expr2
x <- 5
y <- 10
if (x == 0) {
  y <- 0
} else {
  y <- y / x
}
y

# for (variable in sequence) expr
ASum <- 0
AProd <- 1
for (i in seq_len(x)) {   # seq_len(x) is 1..x, and safely empty when x is 0
  ASum <- ASum + i
  AProd <- AProd * i
}
ASum    # equivalent to sum(1:x)
AProd   # equivalent to prod(1:x)

# while (condition) expr
while (x > 0) {
  print(x)
  x <- x - 1
}

# repeat expr, use break to exit the loop
repeat {
  print(x)
  x <- x + 1
  if (x > 10) break
}


# ------->> R plots and colors --------

# In most R functions, you can use named colors, hex, or rgb values:
# (In the simple base plot chart below x and y are point coordinates, pch
# is the point symbol shape, cex is the point size, and col is the color.
# To see the parameters for plotting in base R, check out ?par)
plot(x = 1:10, y = rep(5, 10), pch = 19, cex = 5, col = "dark red")
points(x = 1:10, y = rep(6, 10), pch = 19, cex = 5, col = "#557799")
points(x = 1:10, y = rep(4, 10), pch = 19, cex = 5, col = rgb(.25, .5, .3))

# You may notice that rgb here ranges from 0 to 1. While this is the R default,
# you can also set it for the 0-255 range:
rgb(10, 100, 100, maxColorValue = 255)

# We can also set the opacity/transparency using the parameter 'alpha' (range 0-1):
plot(x = 1:5, y = rep(5, 5), pch = 19, cex = 16,
     col = rgb(.25, .5, .3, alpha = .5), xlim = c(0, 6))

# If we have a hex color representation, we can set the transparency alpha
# using 'adjustcolor' from package 'grDevices'. For fun, let's also set the
# plot background to gray using the par() function for graphical parameters.
# par() returns the previous settings, which we keep so we can restore them.
old.par <- par(bg = "gray40")
col.tr <- grDevices::adjustcolor("#557799", alpha = 0.7)
plot(x = 1:5, y = rep(5, 5), pch = 19, cex = 20, col = col.tr, xlim = c(0, 6))
par(old.par)   # put the background back the way it was

# If you plan on using the built-in color names, here's what they are:
colors()
grep("blue", colors(), value = TRUE)

# In many cases, we need a number of contrasting colors, or multiple shades of a color.
# R comes with some predefined palette function that can generate those for us.
pal1 <- heat.colors(5, alpha = 1)   # generate 5 colors from the heat palette, opaque
pal2 <- rainbow(5, alpha = .5)      # generate 5 colors from the rainbow palette, semi-transparent
plot(x = 1:10, y = 1:10, pch = 19, cex = 10, col = pal1)
plot(x = 10:1, y = 1:10, pch = 19, cex = 10, col = pal2)

# We can also generate our own gradients using colorRampPalette.
# Note that colorRampPalette returns a *function* that we can use
# to generate as many colors from that palette as we need.
# A two-color gradient from light gray to red. palf is a function: palf(n)
# returns n evenly spaced colors along the gradient. We ask for exactly as many
# colors as there are points (10); with a longer color vector only its first
# 10 entries would be used and the gradient would never reach red.
palf <- colorRampPalette(c("gray70", "red"))
plot(x = 10:1, y = 1:10, pch = 19, cex = 10, col = palf(10))

# To add transparency to colorRampPalette, you need to add a parameter `alpha=TRUE`:
palf <- colorRampPalette(c(rgb(1, 1, 1, .2), rgb(.8, 0, 0, .7)), alpha = TRUE)
plot(x = 10:1, y = 1:10, pch = 19, cex = 10, col = palf(10))