-
Notifications
You must be signed in to change notification settings - Fork 128
/
summarise_data.R
45 lines (37 loc) · 1008 Bytes
/
summarise_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# summarise_data.R
# Bastiaan Quast
# file size in MiB (bytes / 1024^2)
# Size on disk of each raw text file: the byte count reported by the file
# system divided by 2^20 (= 1024^2). Each value is auto-printed.
file.size("final/en_US/en_US.blogs.txt") / 2^20
file.size("final/en_US/en_US.news.txt") / 2^20
file.size("final/en_US/en_US.twitter.txt") / 2^20
# load the saved data sets
# Restore the objects stored in each .RData file into the calling environment.
# The statements further down use `blogs`, `news` and `twitter`, which are
# presumably the objects these files provide.
load(file = "blogs.RData")
load(file = "news.RData")
load(file = "twitter.RData")
# number of lines
# Element count of each data set (one element per line of text).
length(x = blogs)
length(x = news)
length(x = twitter)
# number of characters per line
# Distribution of line lengths, counted in characters, for each data set.
summary(object = nchar(x = blogs, type = "chars"))
summary(object = nchar(x = news, type = "chars"))
summary(object = nchar(x = twitter, type = "chars"))
# more character analysis
library(stringi)
# For every data set compute two things with stringi:
#   stats_* : general text statistics for the whole vector
#   words_* : number of words in each element
# blogs
stats_blogs <- stri_stats_general(str = blogs)
words_blogs <- stri_count_words(str = blogs)
# news
stats_news <- stri_stats_general(str = news)
words_news <- stri_count_words(str = news)
# twitter
stats_twitter <- stri_stats_general(str = twitter)
words_twitter <- stri_count_words(str = twitter)
# summaries
# Distribution of the per-element word counts for each data set.
summary(object = words_blogs)
summary(object = words_news)
summary(object = words_twitter)
# plots
library(ggplot2)
# Histogram of the per-element word counts for each data set.
#
# `qplot()` is deprecated (ggplot2 >= 3.4.0), so each histogram is built with
# `ggplot()` + `geom_histogram()`, which is what `qplot()` produced for a
# single numeric vector. The data-frame column keeps the variable's name so
# the x-axis label is unchanged.
#
# The plots are wrapped in print() because a ggplot object is only drawn when
# it is printed; a bare top-level expression is not auto-printed when the
# script is run through source().
print(
  ggplot(data.frame(words_blogs = words_blogs), aes(x = words_blogs)) +
    geom_histogram()
)
print(
  ggplot(data.frame(words_news = words_news), aes(x = words_news)) +
    geom_histogram()
)
print(
  ggplot(data.frame(words_twitter = words_twitter), aes(x = words_twitter)) +
    geom_histogram()
)