SlideShare ist ein Scribd-Unternehmen logo
1 von 15
Downloaden Sie, um offline zu lesen
Happy,	
  Sad,	
  Indifferent	
  …	
  
Quantifying Text Sentiment in R
  

              Rajarshi	
  Guha	
  
                           	
  
             CT	
  R	
  Users	
  Group	
  
                   May	
  2012	
  
Preamble	
  
•  https://github.com/rajarshi/ctrug-tweet
  
•  Focus	
  is	
  on	
  using	
  R	
  to	
  perform	
  this	
  task	
  
•  Won’t comment on validity, rigor, utility, … of
   sentiment analysis methods
  
•  Some	
  of	
  the	
  example	
  data	
  is	
  available	
  freely,	
  
   other	
  parts	
  available	
  on	
  request	
  	
  
Getting Twitter Data
  
•  Based on a collaboration with Prof. Debs Ghosh
   (UConn), studying obesity & social media
  
•  Accessing Twitter is easy using many languages
  
    –  We obtained tweets via a PHP client running over an
       extended period of time
  
    –  Ended	
  up	
  with	
  108,164	
  tweets	
  
•  Won’t focus on accessing Twitter data from R
    –  Very straightforward with twitteR
  
Cleaning	
  Text	
  
•  Load in tweet data, get rid of urls, HTML
   escape codes, punctuation etc
  
d <- read.csv('pizza-unique.csv', colClass='character',
              comment='', header=TRUE)
d$geox <- as.numeric(d$geox)
d$geoy <- as.numeric(d$geoy)

remove.urls <- function(x) gsub("http.*$", "", gsub('http.*\\s', ' ', x))
remove.html <- function(x) gsub('&quot;', '', x)

d$text <- remove.urls(d$text)
d$text <- remove.html(d$text)
d$text <- gsub("@", "FOOBAZ", d$text)
d$text <- gsub("[[:punct:]]+", " ", d$text)
d$text <- gsub("FOOBAZ", "@", d$text)
d$text <- gsub("[[:space:]]+", ' ', d$text)
d$text <- tolower(d$text)
  
Quantifying Sentiment
•  Based on identifying words with positive or
   negative connotations
•  Fundamentally based on looking up words
   from a dictionary
•  If a tweet has more positive words than
   negative words, the tweet is positive
•  More sophisticated scoring schemes are
   possible
Better Dictionaries?
  
•  SentiWordNet
  
   –  Derived from WordNet, each term is assigned a
      positivity and negativity score
  
   –  206K	
  terms	
  
   –  Converted	
  to	
  simple	
  	
  
                                                  1.0




      CSV	
  for	
  easy	
  import	
  	
          0.8




      into	
  R	
                                 0.6                                      Sentiment
                                     Proportion




•  Ideally,	
  should	
  	
  
                                                                                              negative
                                                                                              neutral
                                                                                              positive
                                                  0.4


   perform	
  POS	
  tagging	
  
                                                  0.2




                                                  0.0

                                                        adjective   adverb   noun   verb
Scoring	
  Tweets	
  
•  Given a scoring function, we can process the
   tweets

swn <- read.csv('sentinet_r.csv', header=TRUE,
                as.is=TRUE)

     –  Perfect use case for parallel processing

swn.match <- function(w) {
  tmp <- subset(swn, Term == w)
  if (nrow(tmp) >= 1) return(tmp[1,c(3,4)])
  else return(c(0,0))
}

     –  Easily switch out the scoring function

score.swn <- function(tweet) {
  words <- strsplit(tweet, "\\s+")[[1]]
  cs <- colSums(do.call('rbind',
                        lapply(words, function(z)
                               swn.match(z))))
  return(cs[1]-cs[2])
}

scores <- mclapply(d$text, score.swn)
  
Profiling	
  Makes	
  Me	
  Happy	
  
        •  6052 sec with 24 cores

swn.match <- function(w) {
  tmp <- subset(swn, Term == w)
  if (nrow(tmp) >= 1) return(tmp[1,c(3,4)])
  else return(c(0,0))
}

        •  Rprof() is a good way to identify bottlenecks*

score.swn <- function(tweet) {
  words <- strsplit(tweet, "\\s+")[[1]]
  cs <- colSums(do.call('rbind',
                        lapply(words, function(z)
                               swn.match(z))))
  return(cs[1]-cs[2])
}

        •  461 sec with 24 cores

score.swn.2 <- function(tweet) {
  words <- strsplit(tweet, "\\s+")[[1]]
  rows <- match(words, swn$Term)
  rows <- rows[!is.na(rows)]
  cs <- colSums(swn[rows,c(3,4)])
  return(cs[1]-cs[2])
}
  
                                              	
  
*	
  overkill	
  for	
  this	
  example	
  
Looking	
  at	
  the	
  Scores	
  
 •  Bulk	
  of	
  the	
  tweets	
                                    2.5



    are	
  neutral	
                                                 2.0




 •  Similar	
  behavior	
                                                                                            Method




                                                           density
                                                                     1.5
                                                                                                                        SWN



    from	
  either	
  	
  
                                                                                                                        Breen

                                                                     1.0



    scoring	
  function
                                             0.5




                                                                     0.0

                                                                           -6   -4     -2       0        2   4   6
                                                                                        Sentiment Scores
d$swn <- unlist(scores.swn)
d$breen <- unlist(scores.breen)

tmp <- rbind(data.frame(Method='SWN', Scores=d$swn),
             data.frame(Method='Breen', Scores=d$breen))
ggplot(tmp, aes(x=Scores, fill=Method)) +
  geom_density(alpha=0.25) +
  xlab("Sentiment Scores")
  
Sentiment & Time of Day
   •  Group tweets by hour and evaluate how
      proportions of positive, negative, etc vary.
  
tmp <- d
tmp$hour <- strptime(d$time, format='%a, %d %b %Y %H:%M')$hour

tmp <- subset(tmp, !is.na(swn))
tmp$status <- sapply(tmp$swn, function(x) {
  if (x > 0) return("Positive")
  else if (x < 0) return("Negative")
  else return("Neutral")
})

tmp <- data.frame(do.call('rbind',
                          by(tmp, tmp$hour, function(x) table(x$status))))
tmp$Hour <- factor(rownames(tmp), levels=0:23)
tmp <- melt(tmp, id='Hour', variable_name='Sentiment')
ggplot(tmp, aes(x=Hour,y=value,fill=Sentiment))+geom_bar(position='fill')+
  xlab("")+ylab("Proportion")
	
  
Sentiment & Time of Day
  
             1.0




             0.8




             0.6                                                                                       Sentiment
Proportion




                                                                                                          Negative
                                                                                                          Neutral

             0.4                                                                                          Positive




             0.2




             0.0

                   0   1   2   3   4   5   6   7   8   9   10 11 12 13 14 15 16 17 18 19 20 21 22 23
Contradictions?
       •  Tweets that are negative according to one
          score but positive according to another

subset(d, swn < -2 & breen > 1)
  




"i	
  m	
  trying	
  to	
  get	
  some	
  legit	
  food	
  right	
  now	
  like	
  pizza	
  or	
  chicken	
  not	
  this	
  shi7y	
  ass	
  school	
  lunch”
	
  
"24	
  i	
  like	
  reading	
  25	
  i	
  hate	
  hopsin	
  26	
  i	
  love	
  chips	
  salsa	
  27	
  i	
  love	
  chevys	
  28	
  i	
  	
  
        	
  was	
  a	
  thug	
  in	
  middle	
  school	
  29	
  i	
  love	
  pizza”	
  
	
  
"@naturesempwm	
  had	
  a	
  raw	
  pizza	
  4	
  lunch	
  today	
  but	
  i	
  was	
  not	
  impressed	
  with	
  the	
  dried	
  out	
  	
  
        	
  not	
  fresh	
  vegetable	
  spring	
  roll	
  i	
  bought	
  threw	
  out	
  "	
  
Sentiment and Geography
•  What’s the spatial distribution of tweet
   sentiment?
•  Extract tweets located in the CONUS (~ 500)
•  Visualize the direction and strength of
   sentiments
                                                            swn



•  Correlate	
  with	
  
                                                                                -1
                                                                                0
                                                                                1



   other	
  socio-
  
                                                                                2

                                                                          abs(swn)



   economic	
  factors?	
  
                                                                                0.0
                                                                                0.5
                                                                                1.0
                                                                                1.5
                                                                                2.0
Other Considerations
•  Should take into account negation
    –  Scan for negation terms and adjust score
       appropriately
•  Oblivious to sarcasm
•  Sentiment scores should probably be modified
   by context
•  Lots of M/L opportunities
    –  Spatial analysis
    –  Topic modeling / clustering
    –  Predictive models
  

Weitere ähnliche Inhalte

Andere mochten auch

SUNG PARK PREDICT 422 Group Project Presentation
SUNG PARK PREDICT 422 Group Project PresentationSUNG PARK PREDICT 422 Group Project Presentation
SUNG PARK PREDICT 422 Group Project PresentationSung Park
 
R user group presentation
R user group presentationR user group presentation
R user group presentationTom Liptrot
 
Automatic extraction of microorganisms and their habitats from free text usin...
Automatic extraction of microorganisms and their habitats from free text usin...Automatic extraction of microorganisms and their habitats from free text usin...
Automatic extraction of microorganisms and their habitats from free text usin...Catherine Canevet
 
Twitter Hashtag #appleindia Text Mining using R
Twitter Hashtag #appleindia Text Mining using RTwitter Hashtag #appleindia Text Mining using R
Twitter Hashtag #appleindia Text Mining using RNikhil Gadkar
 
Computing Probabilities With R: mining the patterns in lottery
Computing Probabilities With R: mining the patterns in lotteryComputing Probabilities With R: mining the patterns in lottery
Computing Probabilities With R: mining the patterns in lotteryChia-Chi Chang
 
Text mining with R-studio
Text mining with R-studioText mining with R-studio
Text mining with R-studioAshley Lindley
 
My Data Analysis Portfolio (Text Mining)
My Data Analysis Portfolio (Text Mining)My Data Analysis Portfolio (Text Mining)
My Data Analysis Portfolio (Text Mining)Vincent Handara
 
Data mining with R- regression models
Data mining with R- regression modelsData mining with R- regression models
Data mining with R- regression modelsHamideh Iraj
 
Twitter Text Mining with Web scraping, R, Shiny and Hadoop - Richard Sheng
Twitter Text Mining with Web scraping, R, Shiny and Hadoop - Richard Sheng Twitter Text Mining with Web scraping, R, Shiny and Hadoop - Richard Sheng
Twitter Text Mining with Web scraping, R, Shiny and Hadoop - Richard Sheng Richard Sheng
 
Data Exploration and Visualization with R
Data Exploration and Visualization with RData Exploration and Visualization with R
Data Exploration and Visualization with RYanchang Zhao
 
Introduction to Data Mining with R and Data Import/Export in R
Introduction to Data Mining with R and Data Import/Export in RIntroduction to Data Mining with R and Data Import/Export in R
Introduction to Data Mining with R and Data Import/Export in RYanchang Zhao
 
hands on: Text Mining With R
hands on: Text Mining With Rhands on: Text Mining With R
hands on: Text Mining With RJahnab Kumar Deka
 
R Reference Card for Data Mining
R Reference Card for Data MiningR Reference Card for Data Mining
R Reference Card for Data MiningYanchang Zhao
 
An Introduction to Data Mining with R
An Introduction to Data Mining with RAn Introduction to Data Mining with R
An Introduction to Data Mining with RYanchang Zhao
 
THE 3V's OF BIG DATA: VARIETY, VELOCITY, AND VOLUME from Structure:Data 2012
THE 3V's OF BIG DATA: VARIETY, VELOCITY, AND VOLUME from Structure:Data 2012THE 3V's OF BIG DATA: VARIETY, VELOCITY, AND VOLUME from Structure:Data 2012
THE 3V's OF BIG DATA: VARIETY, VELOCITY, AND VOLUME from Structure:Data 2012Gigaom
 
Regression and Classification with R
Regression and Classification with RRegression and Classification with R
Regression and Classification with RYanchang Zhao
 
A short tutorial on r
A short tutorial on rA short tutorial on r
A short tutorial on rAshraf Uddin
 
Natural Language Processing in R (rNLP)
Natural Language Processing in R (rNLP)Natural Language Processing in R (rNLP)
Natural Language Processing in R (rNLP)fridolin.wild
 
Introduction to R for Data Mining (Feb 2013)
Introduction to R for Data Mining (Feb 2013)Introduction to R for Data Mining (Feb 2013)
Introduction to R for Data Mining (Feb 2013)Revolution Analytics
 

Andere mochten auch (20)

SUNG PARK PREDICT 422 Group Project Presentation
SUNG PARK PREDICT 422 Group Project PresentationSUNG PARK PREDICT 422 Group Project Presentation
SUNG PARK PREDICT 422 Group Project Presentation
 
R user group presentation
R user group presentationR user group presentation
R user group presentation
 
Automatic extraction of microorganisms and their habitats from free text usin...
Automatic extraction of microorganisms and their habitats from free text usin...Automatic extraction of microorganisms and their habitats from free text usin...
Automatic extraction of microorganisms and their habitats from free text usin...
 
Twitter Hashtag #appleindia Text Mining using R
Twitter Hashtag #appleindia Text Mining using RTwitter Hashtag #appleindia Text Mining using R
Twitter Hashtag #appleindia Text Mining using R
 
Computing Probabilities With R: mining the patterns in lottery
Computing Probabilities With R: mining the patterns in lotteryComputing Probabilities With R: mining the patterns in lottery
Computing Probabilities With R: mining the patterns in lottery
 
Text mining with R-studio
Text mining with R-studioText mining with R-studio
Text mining with R-studio
 
My Data Analysis Portfolio (Text Mining)
My Data Analysis Portfolio (Text Mining)My Data Analysis Portfolio (Text Mining)
My Data Analysis Portfolio (Text Mining)
 
Data mining with R- regression models
Data mining with R- regression modelsData mining with R- regression models
Data mining with R- regression models
 
Twitter Text Mining with Web scraping, R, Shiny and Hadoop - Richard Sheng
Twitter Text Mining with Web scraping, R, Shiny and Hadoop - Richard Sheng Twitter Text Mining with Web scraping, R, Shiny and Hadoop - Richard Sheng
Twitter Text Mining with Web scraping, R, Shiny and Hadoop - Richard Sheng
 
Data Exploration and Visualization with R
Data Exploration and Visualization with RData Exploration and Visualization with R
Data Exploration and Visualization with R
 
Introduction to Data Mining with R and Data Import/Export in R
Introduction to Data Mining with R and Data Import/Export in RIntroduction to Data Mining with R and Data Import/Export in R
Introduction to Data Mining with R and Data Import/Export in R
 
hands on: Text Mining With R
hands on: Text Mining With Rhands on: Text Mining With R
hands on: Text Mining With R
 
R Reference Card for Data Mining
R Reference Card for Data MiningR Reference Card for Data Mining
R Reference Card for Data Mining
 
An Introduction to Data Mining with R
An Introduction to Data Mining with RAn Introduction to Data Mining with R
An Introduction to Data Mining with R
 
THE 3V's OF BIG DATA: VARIETY, VELOCITY, AND VOLUME from Structure:Data 2012
THE 3V's OF BIG DATA: VARIETY, VELOCITY, AND VOLUME from Structure:Data 2012THE 3V's OF BIG DATA: VARIETY, VELOCITY, AND VOLUME from Structure:Data 2012
THE 3V's OF BIG DATA: VARIETY, VELOCITY, AND VOLUME from Structure:Data 2012
 
Regression and Classification with R
Regression and Classification with RRegression and Classification with R
Regression and Classification with R
 
A short tutorial on r
A short tutorial on rA short tutorial on r
A short tutorial on r
 
Natural Language Processing in R (rNLP)
Natural Language Processing in R (rNLP)Natural Language Processing in R (rNLP)
Natural Language Processing in R (rNLP)
 
Introduction to R for Data Mining (Feb 2013)
Introduction to R for Data Mining (Feb 2013)Introduction to R for Data Mining (Feb 2013)
Introduction to R for Data Mining (Feb 2013)
 
TextMining with R
TextMining with RTextMining with R
TextMining with R
 

Mehr von Rajarshi Guha

Pharos: A Torch to Use in Your Journey in the Dark Genome
Pharos: A Torch to Use in Your Journey in the Dark GenomePharos: A Torch to Use in Your Journey in the Dark Genome
Pharos: A Torch to Use in Your Journey in the Dark GenomeRajarshi Guha
 
Pharos: Putting targets in context
Pharos: Putting targets in contextPharos: Putting targets in context
Pharos: Putting targets in contextRajarshi Guha
 
Pharos – A Torch to Use in Your Journey In the Dark Genome
Pharos – A Torch to Use in Your Journey In the Dark GenomePharos – A Torch to Use in Your Journey In the Dark Genome
Pharos – A Torch to Use in Your Journey In the Dark GenomeRajarshi Guha
 
Pharos - Face of the KMC
Pharos - Face of the KMCPharos - Face of the KMC
Pharos - Face of the KMCRajarshi Guha
 
Enhancing Prioritization & Discovery of Novel Combinations using an HTS Platform
Enhancing Prioritization & Discovery of Novel Combinations using an HTS PlatformEnhancing Prioritization & Discovery of Novel Combinations using an HTS Platform
Enhancing Prioritization & Discovery of Novel Combinations using an HTS PlatformRajarshi Guha
 
What can your library do for you?
What can your library do for you?What can your library do for you?
What can your library do for you?Rajarshi Guha
 
So I have an SD File … What do I do next?
So I have an SD File … What do I do next?So I have an SD File … What do I do next?
So I have an SD File … What do I do next?Rajarshi Guha
 
Characterization of Chemical Libraries Using Scaffolds and Network Models
Characterization of Chemical Libraries Using Scaffolds and Network ModelsCharacterization of Chemical Libraries Using Scaffolds and Network Models
Characterization of Chemical Libraries Using Scaffolds and Network ModelsRajarshi Guha
 
From Data to Action : Bridging Chemistry and Biology with Informatics at NCATS
From Data to Action: Bridging Chemistry and Biology with Informatics at NCATSFrom Data to Action: Bridging Chemistry and Biology with Informatics at NCATS
From Data to Action : Bridging Chemistry and Biology with Informatics at NCATSRajarshi Guha
 
Robots, Small Molecules & R
Robots, Small Molecules & RRobots, Small Molecules & R
Robots, Small Molecules & RRajarshi Guha
 
Fingerprinting Chemical Structures
Fingerprinting Chemical StructuresFingerprinting Chemical Structures
Fingerprinting Chemical StructuresRajarshi Guha
 
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D...
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D...Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D...
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D...Rajarshi Guha
 
When the whole is better than the parts
When the whole is better than the partsWhen the whole is better than the parts
When the whole is better than the partsRajarshi Guha
 
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D ...
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D ...Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D ...
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D ...Rajarshi Guha
 
Pushing Chemical Biology Through the Pipes
Pushing Chemical Biology Through the PipesPushing Chemical Biology Through the Pipes
Pushing Chemical Biology Through the PipesRajarshi Guha
 
Characterization and visualization of compound combination responses in a hig...
Characterization and visualization of compound combination responses in a hig...Characterization and visualization of compound combination responses in a hig...
Characterization and visualization of compound combination responses in a hig...Rajarshi Guha
 
The BioAssay Research Database
The BioAssay Research DatabaseThe BioAssay Research Database
The BioAssay Research DatabaseRajarshi Guha
 
Cloudy with a Touch of Cheminformatics
Cloudy with a Touch of CheminformaticsCloudy with a Touch of Cheminformatics
Cloudy with a Touch of CheminformaticsRajarshi Guha
 
Chemical Data Mining: Open Source & Reproducible
Chemical Data Mining: Open Source & ReproducibleChemical Data Mining: Open Source & Reproducible
Chemical Data Mining: Open Source & ReproducibleRajarshi Guha
 
Chemogenomics in the cloud: Is the sky the limit?
Chemogenomics in the cloud: Is the sky the limit?Chemogenomics in the cloud: Is the sky the limit?
Chemogenomics in the cloud: Is the sky the limit?Rajarshi Guha
 

Mehr von Rajarshi Guha (20)

Pharos: A Torch to Use in Your Journey in the Dark Genome
Pharos: A Torch to Use in Your Journey in the Dark GenomePharos: A Torch to Use in Your Journey in the Dark Genome
Pharos: A Torch to Use in Your Journey in the Dark Genome
 
Pharos: Putting targets in context
Pharos: Putting targets in contextPharos: Putting targets in context
Pharos: Putting targets in context
 
Pharos – A Torch to Use in Your Journey In the Dark Genome
Pharos – A Torch to Use in Your Journey In the Dark GenomePharos – A Torch to Use in Your Journey In the Dark Genome
Pharos – A Torch to Use in Your Journey In the Dark Genome
 
Pharos - Face of the KMC
Pharos - Face of the KMCPharos - Face of the KMC
Pharos - Face of the KMC
 
Enhancing Prioritization & Discovery of Novel Combinations using an HTS Platform
Enhancing Prioritization & Discovery of Novel Combinations using an HTS PlatformEnhancing Prioritization & Discovery of Novel Combinations using an HTS Platform
Enhancing Prioritization & Discovery of Novel Combinations using an HTS Platform
 
What can your library do for you?
What can your library do for you?What can your library do for you?
What can your library do for you?
 
So I have an SD File … What do I do next?
So I have an SD File … What do I do next?So I have an SD File … What do I do next?
So I have an SD File … What do I do next?
 
Characterization of Chemical Libraries Using Scaffolds and Network Models
Characterization of Chemical Libraries Using Scaffolds and Network ModelsCharacterization of Chemical Libraries Using Scaffolds and Network Models
Characterization of Chemical Libraries Using Scaffolds and Network Models
 
From Data to Action : Bridging Chemistry and Biology with Informatics at NCATS
From Data to Action: Bridging Chemistry and Biology with Informatics at NCATSFrom Data to Action: Bridging Chemistry and Biology with Informatics at NCATS
From Data to Action : Bridging Chemistry and Biology with Informatics at NCATS
 
Robots, Small Molecules & R
Robots, Small Molecules & RRobots, Small Molecules & R
Robots, Small Molecules & R
 
Fingerprinting Chemical Structures
Fingerprinting Chemical StructuresFingerprinting Chemical Structures
Fingerprinting Chemical Structures
 
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D...
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D...Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D...
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D...
 
When the whole is better than the parts
When the whole is better than the partsWhen the whole is better than the parts
When the whole is better than the parts
 
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D ...
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D ...Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D ...
Exploring Compound Combinations in High Throughput Settings: Going Beyond 1D ...
 
Pushing Chemical Biology Through the Pipes
Pushing Chemical Biology Through the PipesPushing Chemical Biology Through the Pipes
Pushing Chemical Biology Through the Pipes
 
Characterization and visualization of compound combination responses in a hig...
Characterization and visualization of compound combination responses in a hig...Characterization and visualization of compound combination responses in a hig...
Characterization and visualization of compound combination responses in a hig...
 
The BioAssay Research Database
The BioAssay Research DatabaseThe BioAssay Research Database
The BioAssay Research Database
 
Cloudy with a Touch of Cheminformatics
Cloudy with a Touch of CheminformaticsCloudy with a Touch of Cheminformatics
Cloudy with a Touch of Cheminformatics
 
Chemical Data Mining: Open Source & Reproducible
Chemical Data Mining: Open Source & ReproducibleChemical Data Mining: Open Source & Reproducible
Chemical Data Mining: Open Source & Reproducible
 
Chemogenomics in the cloud: Is the sky the limit?
Chemogenomics in the cloud: Is the sky the limit?Chemogenomics in the cloud: Is the sky the limit?
Chemogenomics in the cloud: Is the sky the limit?
 

Kürzlich hochgeladen

My Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationMy Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationRidwan Fadjar
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024BookNet Canada
 
Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 365Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 3652toLead Limited
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsMark Billinghurst
 
Vertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsVertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsMiki Katsuragi
 
Powerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time ClashPowerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time Clashcharlottematthew16
 
Story boards and shot lists for my a level piece
Story boards and shot lists for my a level pieceStory boards and shot lists for my a level piece
Story boards and shot lists for my a level piececharlottematthew16
 
The Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdfThe Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdfSeasiaInfotech2
 
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks..."LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...Fwdays
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebUiPathCommunity
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek SchlawackFwdays
 
"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr BaganFwdays
 
Search Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdfSearch Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdfRankYa
 
Training state-of-the-art general text embedding
Training state-of-the-art general text embeddingTraining state-of-the-art general text embedding
Training state-of-the-art general text embeddingZilliz
 
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)Wonjun Hwang
 
DevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenDevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenHervé Boutemy
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxNavinnSomaal
 
Commit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyCommit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyAlfredo García Lavilla
 
Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Mattias Andersson
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):comworks
 

Kürzlich hochgeladen (20)

My Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationMy Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 Presentation
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
 
Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 365Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 365
 
Human Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR SystemsHuman Factors of XR: Using Human Factors to Design XR Systems
Human Factors of XR: Using Human Factors to Design XR Systems
 
Vertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering TipsVertex AI Gemini Prompt Engineering Tips
Vertex AI Gemini Prompt Engineering Tips
 
Powerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time ClashPowerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time Clash
 
Story boards and shot lists for my a level piece
Story boards and shot lists for my a level pieceStory boards and shot lists for my a level piece
Story boards and shot lists for my a level piece
 
The Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdfThe Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdf
 
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks..."LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
"LLMs for Python Engineers: Advanced Data Analysis and Semantic Kernel",Oleks...
 
Dev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio WebDev Dives: Streamline document processing with UiPath Studio Web
Dev Dives: Streamline document processing with UiPath Studio Web
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
 
"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan
 
Search Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdfSearch Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdf
 
Training state-of-the-art general text embedding
Training state-of-the-art general text embeddingTraining state-of-the-art general text embedding
Training state-of-the-art general text embedding
 
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
Bun (KitWorks Team Study 노별마루 발표 2024.4.22)
 
DevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenDevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache Maven
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptx
 
Commit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyCommit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easy
 
Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):
 

Quantifying Text Sentiment in R Using Dictionaries and Scoring Functions

  • 1. Happy, Sad, Indifferent … Quantifying Text Sentiment in R Rajarshi Guha CT R Users Group May 2012
  • 2. Preamble • https://github.com/rajarshi/ctrug-tweet • Focus is on using R to perform this task • Won’t comment on validity, rigor, utility, … of sentiment analysis methods • Some of the example data is available freely, other parts available on request
  • 3. Getting Twitter Data • Based on a collaboration with Prof. Debs Ghosh (UConn), studying obesity & social media • Accessing Twitter is easy using many languages – We obtained tweets via a PHP client running over an extended period of time – Ended up with 108,164 tweets • Won’t focus on accessing Twitter data from R – Very straightforward with twitteR
  • 4. Cleaning Text • Load in tweet data, get rid of urls, HTML escape codes, punctuation etc d <- read.csv('pizza-unique.csv', colClass='character', comment='', header=TRUE) d$geox <- as.numeric(d$geox) d$geoy <- as.numeric(d$geoy) remove.urls <- function(x) gsub("http.*$", "", gsub('http.*\\s', ' ', x)) remove.html <- function(x) gsub('&quot;', '', x) d$text <- remove.urls(d$text) d$text <- remove.html(d$text) d$text <- gsub("@", "FOOBAZ", d$text) d$text <- gsub("[[:punct:]]+", " ", d$text) d$text <- gsub("FOOBAZ", "@", d$text) d$text <- gsub("[[:space:]]+", ' ', d$text) d$text <- tolower(d$text)
  • 5. Quantifying Sentiment • Based on identifying words with positive or negative connotations • Fundamentally based on looking up words from a dictionary • If a tweet has more positive words than negative words, the tweet is positive • More sophisticated scoring schemes are possible
  • 6. Better Dictionaries? • SentiWordNet – Derived from WordNet, each term is assigned a positivity and negativity score – 206K terms – Converted to simple CSV for easy import into R • Ideally, should perform POS tagging [Chart: Proportion (0.0–1.0) of negative / neutral / positive Sentiment for adjective, adverb, noun, verb]
  • 7. Scoring Tweets • Given a scoring function, we can process the tweets – Perfect use case for parallel processing – Easily switch out the scoring function swn <- read.csv('sentinet_r.csv', header=TRUE, as.is=TRUE) swn.match <- function(w) { tmp <- subset(swn, Term == w) if (nrow(tmp) >= 1) return(tmp[1,c(3,4)]) else return(c(0,0)) } score.swn <- function(tweet) { words <- strsplit(tweet, "\\s+")[[1]] cs <- colSums(do.call('rbind', lapply(words, function(z) swn.match(z)))) return(cs[1]-cs[2]) } scores <- mclapply(d$text, score.swn)
  • 8.
  • 9. Profiling Makes Me Happy • 6052 sec with 24 cores • Rprof() is a good way to identify bottlenecks* • 461 sec with 24 cores swn.match <- function(w) { tmp <- subset(swn, Term == w) if (nrow(tmp) >= 1) return(tmp[1,c(3,4)]) else return(c(0,0)) } score.swn <- function(tweet) { words <- strsplit(tweet, "\\s+")[[1]] cs <- colSums(do.call('rbind', lapply(words, function(z) swn.match(z)))) return(cs[1]-cs[2]) } score.swn.2 <- function(tweet) { words <- strsplit(tweet, "\\s+")[[1]] rows <- match(words, swn$Term) rows <- rows[!is.na(rows)] cs <- colSums(swn[rows,c(3,4)]) return(cs[1]-cs[2]) } * overkill for this example
  • 10. Looking at the Scores • Bulk of the tweets are neutral • Similar behavior from either scoring function [Chart: density (0.0–2.5) of Sentiment Scores (-6 to 6) by Method: SWN, Breen] d$swn <- unlist(scores.swn) d$breen <- unlist(scores.breen) tmp <- rbind(data.frame(Method='SWN', Scores=d$swn), data.frame(Method='Breen', Scores=d$breen)) ggplot(tmp, aes(x=Scores, fill=Method)) + geom_density(alpha=0.25) + xlab("Sentiment Scores")
  • 11. Sentiment & Time of Day • Group tweets by hour and evaluate how proportions of positive, negative, etc vary. tmp <- d tmp$hour <- strptime(d$time, format='%a, %d %b %Y %H:%M')$hour tmp <- subset(tmp, !is.na(swn)) tmp$status <- sapply(tmp$swn, function(x) { if (x > 0) return("Positive") else if (x < 0) return("Negative") else return("Neutral") }) tmp <- data.frame(do.call('rbind', by(tmp, tmp$hour, function(x) table(x$status)))) tmp$Hour <- factor(rownames(tmp), levels=0:23) tmp <- melt(tmp, id='Hour', variable_name='Sentiment') ggplot(tmp, aes(x=Hour,y=value,fill=Sentiment))+geom_bar(position='fill')+ xlab("")+ylab("Proportion")
  • 12. Sentiment & Time of Day [Chart: Proportion (0.0–1.0) of Negative / Neutral / Positive Sentiment for each hour 0–23]
  • 13. Contradictions? • Tweets that are negative according to one score but positive according to another subset(d, swn < -2 & breen > 1) "i m trying to get some legit food right now like pizza or chicken not this shitty ass school lunch" "24 i like reading 25 i hate hopsin 26 i love chips salsa 27 i love chevys 28 i was a thug in middle school 29 i love pizza" "@naturesempwm had a raw pizza 4 lunch today but i was not impressed with the dried out not fresh vegetable spring roll i bought threw out "
  • 14. Sentiment and Geography • What’s the spatial distribution of tweet sentiment? • Extract tweets located in the CONUS (~ 500) • Visualize the direction and strength of sentiments • Correlate with other socio-economic factors? [Map legend: swn -1, 0, 1, 2; abs(swn) 0.0, 0.5, 1.0, 1.5, 2.0]
  • 15. Other Considerations • Should take into account negation – Scan for negation terms and adjust score appropriately • Oblivious to sarcasm • Sentiment scores should probably be modified by context • Lots of M/L opportunities – Spatial analysis – Topic modeling / clustering – Predictive models