# Set working directory
# NOTE(review): hard-coded, machine-specific path; this call errors on any
# machine where the directory does not exist.
setwd('/Users/surajjoshi/R-repo/')
# Set seed so that random operations in this script are reproducible
set.seed(100)
# Import libraries
# library() stops immediately when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with an unrelated error.
library(dplyr)
library(ggplot2)
library(scales)
# Download file
# from web
library(downloader)
url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/msleep_ggplot2.csv"
filename <- 'msleep_ggplot2.csv'
# Skip the download when the file is already present in the working directory
if (!file.exists(filename)) download(url, filename)
msleep <- read.csv(filename)
# from working directory
msleep <- read.csv(filename)
# Survey of data
# NOTE(review): `df`, `main`, `cln2` and `PCs` are not created anywhere in the
# visible script; they are assumed to be data frames loaded beforehand.
head(df)
dim(df)
nrow(df) # row only
ncol(df) # column only
rownames(df)
colnames(df) # for a data frame this is the same as names(df)
class(df$rank)
glimpse(df)
summary(df)
str(df)
# View(df)
as_tibble(df) # tbl_df() is deprecated in dplyr; as_tibble() is its replacement
#unique(df$name)
count(df, smoke)
length(subset(df, readmitted == '<30')) # gives the number of columns
unique(df)
scale_shape_discrete(name = 'Admit', labels = c('No', 'Yes'))
#find the number of records
glimpse(main[main$Math > 80, ])
nrow(main[main$Math > 80, ])
cln2[which(cln2$dose == 100), ]
EUR <- PCs[which(PCs$V13 < 9 & PCs$V13 > 3), ]
# File and directory navigation
dir() # lists files and folders in the current directory
getwd() # returns the current directory
setwd('/Users/surajjoshi/R-repo/') # Sets the working directory to the given path
ls()
options()
history()
# NOTE(review): the two vectors below look like arguments lifted out of a
# read.csv()/read.table() call (col.names = ..., colClasses = ...); the call
# itself is not present in the script. They are kept as standalone vectors.
col.names <- c('abc', 'def')
colClasses <- c("character", "character", "complex",
                "factor", "factor", "character", "integer",
                "integer", "numeric", "character", "character",
                "Date", "integer", "logical")
# Exploratory Analysis
# three ways to do one task
# NOTE(review): the three assignments below were reconstructed from text that
# was damaged by an HTML conversion; confirm that `rank < 3` is the intended
# condition. Base `[` keeps rows where the condition is NA, while subset()
# and filter() drop them.
highrank <- df[df$rank < 3, ]
subsetrank <- subset(df, rank < 3)
filterrank <- filter(df, rank < 3)
filter(df, age <= 20 & age >= 19) %>% nrow()
# Random sampling
sample_n(df, 10, replace = TRUE)
sample_frac(df, .1, replace = TRUE) %>% dim()
count(df, smoke)
# Apply a function to select columns
apply(df[, c(2, 3, 4)], 1, sum, na.rm = TRUE) # adds each row
apply(df[, c(2, 3, 4)], 2, sum, na.rm = TRUE) # adds each column
# Transformation
# Derive a Division label from Percentage; the conditions are tested in the
# order written, so the first one that holds decides the label.
main2 <- transform(
  main,
  Division = ifelse(Percentage >= 75, 'Distinction',
             ifelse(Percentage < 40, 'Fail',
             ifelse(Percentage >= 60, 'First',
             ifelse(Percentage >= 50, 'Second', 'Third'))))
)
main2
# Merge
# NOTE(review): the argument order of rbind() was reconstructed from damaged
# text; rbind() on data frames requires matching column names -- confirm.
mainFinal <- rbind(main, main2, main3)
mainFinal2 <- merge(mainFinal, main4, by = "ID")
mainFinal2 <- select(mainFinal2, -Division, -Percentage, -Total)
head(mainFinal2)
# Renaming column name
head(mainFinal2)
colnames(mainFinal2)[3] <- 'Math'
colnames(mainFinal2)[4] <- 'Sci'
head(mainFinal2)
# Adding a column
df$new_column_name <- 1:60
# Removing a column
df$new_column_name <- NULL
# Select
nth(main$Eng, 2) # selecting 2nd row in Eng column
# Selecting rows with certain values
mainFinal2[mainFinal2$Division == 'First', ]
# Writing csv file
# write.csv(carbon, file="carbon.csv")
# Subset
# Filter
# Aggregate
# Visualization
## Basic Visualization
# Indices at which an element of `main` equals 80
# (assumes `main` is a data frame or matrix created elsewhere -- TODO confirm)
which(main==80)
# For each element of main$Math: 1 where it equals 80, NA otherwise
match(main$Math, 80)
# Distinct values of main$Eng
unique(main$Eng)
# NOTE(review): plots columns 2 to 6 of `main`, with main$ID passed as the
# second positional argument; confirm the intended x/y roles.
plot(main[,2:6], main$ID)
# Load the built-in mtcars data set into the workspace
data("mtcars")
# Density-scaled histogram of mpg with a kernel density estimate on top
hist(mtcars$mpg, 20, freq = FALSE)
lines(density(mtcars$mpg), col = 'blue', lwd = 2)
# Simple linear regression of mpg on hp using all rows
fit1 <- lm(mpg ~ hp, mtcars)
summary(fit1)
plot(mpg ~ hp, mtcars)
abline(reg = fit1)
# Inspect hp and count its missing / non-missing values
summary(mtcars$hp)
any(is.na(mtcars$hp))
sum(is.na(mtcars$hp))
sum(!is.na(mtcars$hp))
# Set the hp == 335 observation to NA so that lm() drops it from the refit
is.na(mtcars$hp) <- which(mtcars$hp == 335)
fit2 <- lm(mpg ~ hp, mtcars)
summary(fit2)
plot(mpg ~ hp, mtcars)
abline(reg = fit2)
# Standard lm diagnostic plots
plot(fit2)
# Group main3 by Division
# NOTE(review): `main3` and `df` are not created in the visible script.
grouped <- group_by(main3, Division)
# NOTE(review): summarise() is given the bare function `last` here rather than
# a call such as last(Percentage); confirm the intended summary.
summarise(grouped, last)
last(grouped$Percentage)
# `%.%` was dplyr's original chaining operator and has since been removed;
# `%>%` is the supported pipe.
df %>% group_by(State) %>% arrange(Value) %>% filter(Value == nth(Value, 3))
main3 %>% group_by(Division) %>% arrange(Percentage) %>% filter(Percentage == nth(Percentage, 1))
# Mark values in a range
# NOTE: the name `table` masks base::table() while this object exists.
# NOTE(review): the order of the population values was reconstructed from
# damaged text -- confirm against the original data.
table <- data.frame(population = c(100, 300, 5000, 2000, 900, 2500),
                    habitat = c(1, 2, 3, 4, 5, 6))
# Manual binning with one logical mask per range
table$size[table$population < 500] <- 1
table$size[table$population >= 500 & table$population <= 1000] <- 2
table$size[table$population > 1000 & table$population <= 3000] <- 3
table$size[table$population > 3000] <- 4
# Same idea with findInterval(): the result is the number of break points
# that are <= the value; rightmost.closed = TRUE puts a value equal to the
# last break into the last interval instead of one past it.
table$size2 <- findInterval(table$population, c(500, 1000, 2000, 3000, 5000),
                            rightmost.closed = TRUE)
# This overwrites the manually binned `size` column with the findInterval codes
table$size <- findInterval(table$population, c(500, 1000, 2000, 3000, 5000),
                           rightmost.closed = TRUE)
# Remove Duplicates
x <- c(1:5, 3:7, 0:8)
x
# Keep the first occurrence of every value; the outer parentheses print xu
(xu <- x[!duplicated(x)])
# No comments:
# Post a Comment