
I'll Get Serious Starting in Reiwa

Reading Hack with R



  1. > me
     $name
     [1] "Takashi Kitano"

     $twitter
     [1] "@kashitan"

     $work_in
     [1] " " -> " ( 6/30)"
  2. Two months since the new era began. How is everyone doing?
  3. I imagine many of you have turned over a new leaf.
  4. I know: let's do a reading hack with R.
  5. titles <- c(" ", " ", " ", " ", "7 ")
     books <- dplyr::tibble(
       docId  = as.character(1:length(titles)),
       title  = titles,
       author = c(rep(" ", 2), rep(" F ", 2), " R "))
  6. books %>% knitr::kable()

     docId   title   author
     1
     2
     3               F
     4               F
     5       7       R
  7. sentDf %>% head() %>% knitr::kable()

     docId   sectionId   sentenceId    sentence
     1       1_0006      1_0006_0001   ──
     1       1_0006      1_0006_0002
     1       1_0006      1_0006_0003
     1       1_0006      1_0006_0004
     1       1_0006      1_0006_0005
     1       1_0006      1_0006_0006
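The slides never show how sentDf was built. A minimal sketch under stated assumptions: the section text already sits in a data frame, and sentences are split on the Japanese full stop 。 (both the input shape and the splitting rule are my guesses, not the deck's code):

    library(dplyr)
    library(stringr)

    # Hypothetical input: one row of raw text per book section.
    rawDf <- tibble(
      docId     = "1",
      sectionId = c("1_0006", "1_0007"),
      text      = c("...", "...")   # section text, elided here
    )

    sentDf <- rawDf %>%
      # split each section into sentences at the Japanese full stop
      mutate(sentence = str_split(text, pattern = "(?<=。)")) %>%
      tidyr::unnest(cols = sentence) %>%
      filter(sentence != "") %>%
      group_by(sectionId) %>%
      # zero-padded ids like 1_0006_0001, matching the slides
      mutate(sentenceId = sprintf("%s_%04d", sectionId, row_number())) %>%
      ungroup() %>%
      select(docId, sectionId, sentenceId, sentence)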
  8. tokenDf <- sentDf %>%
       as.data.frame() %>%
       RMeCab::RMeCabDF("sentence", 1) %>%
       purrr::set_names(nm = sentDf$sentenceId) %>%
       purrr::map2_dfr(.x = ., .y = names(.),
                       .f = function(x, y) {
                         tibble(docId      = stringr::str_replace(y, "_.*", ""),
                                sectionId  = stringr::str_replace(y, "(.+_.*)_.*", "\\1"),
                                sentenceId = y,
                                token      = x,
                                hinshi     = names(x))
                       })
  9. tokenDf %>% head(n = 10) %>% knitr::kable()

     docId   sectionId   sentenceId    token   hinshi
     1       1_0006      1_0006_0001
     1       1_0006      1_0006_0001   ─
     1       1_0006      1_0006_0001   ─
     1       1_0006      1_0006_0001
     1       1_0006      1_0006_0001
     1       1_0006      1_0006_0001
     1       1_0006      1_0006_0002
     1       1_0006      1_0006_0002
     1       1_0006      1_0006_0002
     1       1_0006      1_0006_0002
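Before the part-of-speech filter that appears on slide 14, it can help to check which hinshi (part-of-speech) tags MeCab actually emitted; a one-line inspection of my own:

    # tag frequencies across all books
    tokenDf %>% dplyr::count(hinshi, sort = TRUE)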
  10. https://www.intage.co.jp/glossary/400/
  11. https://www.gastonsanchez.com/visually-enforced/how-to/2012/07/19/Correspondence-Analysis/
  12. https://www.gastonsanchez.com/visually-enforced/how-to/2012/07/19/Correspondence-Analysis/
  13. https://youtu.be/dE10fBCDWQc
  14. sw <- c(" ", " ", " ", " ", " ", " ", " ", " ", " ", " ", " ")
      CA_in <- tokenDf %>%
        dplyr::filter(hinshi %in% c(" ", " ")) %>%
        dplyr::filter(! token %in% sw) %>%
        dplyr::count(docId, token) %>%
        dplyr::ungroup() %>%
        dplyr::inner_join(books, by = "docId") %>%
        dplyr::select(token, title, n) %>%
        tidyr::spread(key = title, value = n, fill = 0) %>%
        dplyr::mutate(max = pmax(!!!rlang::syms(titles))) %>%
        dplyr::top_n(n = 60, wt = max) %>%
        as.data.frame()
  15. rownames(CA_in) <- CA_in$token
      CA_in <- CA_in %>% select(one_of(titles))
      CA_in %>% head() %>% knitr::kable()

                                7
          0     1     0    0   269
         17    15   166  431   110
          0     0     0    0   444
        148   116     1   15   538
        103    71    69   94   246
        182   175    87   87   236
  16. FactoMineR::CA(CA_in)
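Slide 18 onward references CA_out, so the call above was presumably captured in a variable; a minimal bridge showing the pieces of FactoMineR's return value that the later plots use:

    CA_out <- FactoMineR::CA(CA_in)   # draws the default biplot as a side effect

    CA_out$eig              # per dimension: eigenvalue, % of variance, cumulative %
    head(CA_out$row$coord)  # row (token) coordinates
    head(CA_out$col$coord)  # column (title) coordinates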
  17. Prof. Fujimoto, the author of 『対応分析入門』 (An Introduction to Correspondence Analysis), raised an issue about this, but it has sat untouched for nearly two years...
  18. d <- dplyr::tibble(
        x     = CA_out$row$coord[, 1],
        y     = CA_out$row$coord[, 2],
        label = rownames(CA_out$row$coord),
        type  = "row") %>%
        dplyr::bind_rows(tibble(
          x     = CA_out$col$coord[, 1],
          y     = CA_out$col$coord[, 2],
          label = rownames(CA_out$col$coord),
          type  = "col"))
  19. d %>% head(n = 12) %>% knitr::kable()

               x            y   label   type
      -0.4651474   -1.4411540           row
       0.9067268    0.1419623           row
      -0.4639592   -1.4488050           row
      -0.5503475   -0.6609927           row
      -0.0656860   -0.2595280           row
      -0.2077013    0.0100741           row
       0.0335994    0.0251759           row
      -0.5759668    0.0906504           row
      -0.1080142    0.2464559           row
      -0.0559726    0.0791701           row
  20. labels <- glue::glue("Dim {axis} ({format(var, digits = 3, nsmall = 1)} %)",
                           axis = c(1, 2), var = CA_out$eig[1:2, 2])
      d %>%
        ggplot2::ggplot(aes(x = x, y = y, label = label, shape = type, colour = type)) +
        ggplot2::geom_vline(xintercept = 0, linetype = "dashed") +
        ggplot2::geom_hline(yintercept = 0, linetype = "dashed") +
        ggplot2::geom_point() +
        ggrepel::geom_text_repel(family = "HiraMaruProN-W4") +
        ggplot2::scale_shape_manual(values = c(17, 16)) +
        ggplot2::xlab(labels[1]) +
        ggplot2::ylab(labels[2]) +
        ggplot2::ggtitle("CA - Biplot") +
        ggplot2::theme(legend.position = "none")
  21. Let's run the correspondence analysis on TF-IDF instead (a quick reminder of the computation follows).
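As a reminder of what tidytext::bind_tf_idf (next slide) computes: tf is a term's share of the tokens in a document, idf is the log of the inverse document frequency, and tf-idf is their product. A minimal sketch reproducing it by hand on toy counts:

    library(dplyr)

    # toy counts in the same shape as dplyr::count(docId, token)
    counts <- tibble(
      docId = c("1", "1", "2"),
      token = c("a", "b", "a"),
      n     = c(2L, 1L, 3L)
    )

    n_docs <- n_distinct(counts$docId)

    counts %>%
      group_by(docId) %>%
      mutate(tf = n / sum(n)) %>%                         # term share within the document
      ungroup() %>%
      group_by(token) %>%
      mutate(idf = log(n_docs / n_distinct(docId))) %>%   # log inverse document frequency
      ungroup() %>%
      mutate(tf_idf = tf * idf)                           # matches tidytext::bind_tf_idf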
  22. CA_in <- tokenDf %>%
        dplyr::filter(hinshi %in% c(" ", " ")) %>%
        dplyr::filter(! token %in% sw) %>%
        dplyr::count(docId, token) %>%
        dplyr::ungroup() %>%
        tidytext::bind_tf_idf(term = token, document = docId, n = n) %>%
        dplyr::inner_join(books, by = "docId") %>%
        dplyr::select(token, title, tf_idf) %>%
        tidyr::spread(key = title, value = tf_idf, fill = 0.0) %>%
        dplyr::mutate(tfidf_max = pmax(!!!rlang::syms(titles))) %>%
        dplyr::top_n(n = 60, wt = tfidf_max) %>%
        as.data.frame()
  23. rownames(CA_in) <- CA_in$token
      CA_in <- CA_in %>% select(one_of(titles))
      CA_in %>% head() %>% knitr::kable()

                                               7
          0.00E+00   0.0000224   0   0.00E+00   0.0036436
          5.91E-05   0.0019978   0   0.00E+00   0.0000151
          5.91E-05   0.0019853   0   0.00E+00   0.0000151
          0.00E+00   0.0000000   0   0.00E+00   0.0014751
      Ⅱ   0.00E+00   0.0000000   0   2.49E-05   0.0008940
          0.00E+00   0.0000000   0   0.00E+00   0.0025933
  24.
  25. Let's try analyzing with a topic model.
  26. https://www.albert2005.co.jp/knowledge/machine_learning/topic_model/about_topic_model
  27. https://youtu.be/dE10fBCDWQc
  28. LDA_in <- tokenDf %>%
        dplyr::filter(hinshi %in% c(" ", " ")) %>%
        dplyr::filter(! token %in% sw) %>%
        dplyr::group_by(docId, token) %>%
        dplyr::count(token) %>%
        dplyr::ungroup() %>%
        dplyr::group_by(token) %>%
        dplyr::mutate(total = sum(n)) %>%
        dplyr::ungroup() %>%
        dplyr::top_n(n = 5000, wt = total) %>%
        dplyr::inner_join(books, by = "docId") %>%
        tidytext::cast_dtm(title, token, n)
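tidytext::cast_dtm returns a tm DocumentTermMatrix, the input format topicmodels::LDA expects; a quick sanity check using standard accessors (this inspection step is mine, not the deck's):

    LDA_in        # prints document/term counts and sparsity
    dim(LDA_in)   # rows = books (titles), columns = tokens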
  29. LDA_out <- topicmodels::LDA(LDA_in, k = 5, control = list(seed = 123))
      LDA_out %>%
        tidytext::tidy(matrix = "gamma") %>%
        ggplot2::ggplot(aes(factor(topic), gamma)) +
        ggplot2::geom_boxplot() +
        ggplot2::facet_wrap(~ document) +
        ggplot2::theme_minimal(base_family = "HiraMaruProN-W4")
  30. LDA_out %>%
        tidytext::tidy() %>%
        dplyr::group_by(topic) %>%
        dplyr::top_n(n = 10, wt = beta) %>%
        dplyr::ungroup() %>%
        dplyr::mutate(term = reorder(term, beta)) %>%
        ggplot2::ggplot(aes(term, beta, fill = factor(topic))) +
        ggplot2::geom_col(show.legend = FALSE) +
        ggplot2::facet_wrap(~ topic, scales = "free_y") +
        ggplot2::coord_flip() +
        ggplot2::theme_minimal(base_family = "HiraMaruProN-W4")
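tidy(LDA_out) returns the per-topic word weights (beta) plotted above, while matrix = "gamma" gives per-document topic weights; a small follow-up of my own, not on the slides, that tags each book with its single strongest topic:

    library(dplyr)

    LDA_out %>%
      tidytext::tidy(matrix = "gamma") %>%   # document-topic probabilities
      group_by(document) %>%
      slice_max(gamma, n = 1) %>%            # strongest topic per book
      ungroup()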
  31.
  32. https://logmi.jp/business/articles/156592
  33. Let's summarize the text.
  34. https://qiita.com/icoxfog417/items/d06651db10e27220c819
  35. https://qiita.com/icoxfog417/items/d06651db10e27220c819
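The slides that follow run the lexRankr pipeline step by step rather than calling the all-in-one wrapper, presumably because the package's built-in tokenizer is English-oriented while the Japanese tokens here come from RMeCab. For comparison, the one-call form on English text would look roughly like this (argument names follow lexRankr's documented API, but treat the exact call as an assumption):

    # One-call LexRank on English sentences; for Japanese, tokenization
    # has to happen outside the package, as the next slides do with RMeCab.
    top3 <- lexRankr::lexRank(
      text       = sentDf$sentence,   # one element per sentence
      docId      = sentDf$docId,
      n          = 3,                 # number of sentences to return
      continuous = TRUE               # weighted graph instead of thresholding
    )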
  36. tokenDf <- tokenDf %>%
        dplyr::filter(docId == "1") %>%
        dplyr::group_by(sectionId) %>%
        tidyr::nest()
      tokenDf %>% head()

      # A tibble: 6 x 2
        sectionId data
        <chr>     <list>
      1 1_0006    <tibble [2,642 × 4]>
      2 1_0007    <tibble [1,432 × 4]>
      3 1_0009    <tibble [5,928 × 4]>
      4 1_0010    <tibble [3,755 × 4]>
      5 1_0011    <tibble [5,135 × 4]>
      6 1_0013    <tibble [3,440 × 4]>
  37. tokenDf$data[[1]] %>% head() %>% knitr::kable()

      docId   sentenceId    token   hinshi
      1       1_0006_0001
      1       1_0006_0001   ─
      1       1_0006_0001   ─
      1       1_0006_0001
      1       1_0006_0001
      1       1_0006_0001

      tokenDf$data[[2]] %>% head() %>% knitr::kable()

      docId   sentenceId    token   hinshi
      1       1_0007_0001
      1       1_0007_0001
      1       1_0007_0001
      1       1_0007_0001
      1       1_0007_0001
      1       1_0007_0001
  38. simDf <- tokenDf %>%
        # build the per-section arguments for lexRankr::sentenceSimil()
        dplyr::mutate(sentenceId = purrr::map(.$data, ~.$sentenceId),
                      token      = purrr::map(.$data, ~.$token),
                      docId      = purrr::map2(sectionId, token,
                                               function(x, y) { rep(x, length(y)) })) %>%
        # pairwise sentence similarity within each section
        dplyr::mutate(simil = purrr::pmap(list(sentenceId, token, docId),
                                          lexRankr::sentenceSimil)) %>%
        dplyr::select(sectionId, simil)
  39. simDf %>% head()

      # A tibble: 6 x 2
        sectionId simil
        <chr>     <list>
      1 1_0006    <df[,3] [5,671 × 3]>
      2 1_0007    <df[,3] [2,080 × 3]>
      3 1_0009    <df[,3] [39,340 × 3]>
      4 1_0010    <df[,3] [13,695 × 3]>
      5 1_0011    <df[,3] [25,651 × 3]>
      6 1_0013    <df[,3] [12,880 × 3]>
  40. simDf$simil[[1]] %>% head() %>% knitr::kable()

      sent1         sent2         similVal
      1_0006_0001   1_0006_0002   0.5045201
      1_0006_0001   1_0006_0003   0.4682931
      1_0006_0001   1_0006_0004   0.0000000
      1_0006_0001   1_0006_0005   0.5541319
      1_0006_0001   1_0006_0006   0.5856045
      1_0006_0001   1_0006_0007   0.4752808
  41. topNSents <- simDf %>%
        # unpack the per-section arguments for lexRankr::lexRankFromSimil()
        dplyr::mutate(s1    = purrr::map(.$simil, ~.$sent1),
                      s2    = purrr::map(.$simil, ~.$sent2),
                      simil = purrr::map(.$simil, ~.$similVal),
                      # keep ~10% of each section's sentences (counted here from the
                      # similarity pairs, since sentenceId was dropped on slide 38)
                      n     = purrr::map(.$simil, function(x) {
                        as.integer(ceiling(dplyr::n_distinct(c(x$sent1, x$sent2)) * 0.1))
                      })) %>%
        # LexRank score per sentence, top n per section
        dplyr::mutate(topN = purrr::pmap(list(s1, s2, simil, n),
                                         lexRankr::lexRankFromSimil,
                                         threshold = 0.2, continuous = TRUE)) %>%
        dplyr::select(sectionId, topN)
  42. topNSents %>% head()

      # A tibble: 6 x 2
        sectionId topN
        <chr>     <list>
      1 1_0006    <df[,2] [11 × 2]>
      2 1_0007    <df[,2] [7 × 2]>
      3 1_0009    <df[,2] [29 × 2]>
      4 1_0010    <df[,2] [17 × 2]>
      5 1_0011    <df[,2] [23 × 2]>
      6 1_0013    <df[,2] [17 × 2]>
  43. topNSents$topN[[1]] %>% knitr::kable()

      sentenceId    value
      1_0006_0012   0.0110008
      1_0006_0025   0.0110065
      1_0006_0027   0.0109621
      1_0006_0028   0.0110488
      1_0006_0033   0.0109785
      1_0006_0040   0.0110083
      1_0006_0041   0.0110468
      1_0006_0047   0.0110727
      1_0006_0059   0.0110430
      1_0006_0085   0.0110101
      1_0006_0086   0.0109732
  44. res <- topNSents %>%
        dplyr::select(topN) %>%
        tidyr::unnest() %>%
        dplyr::inner_join(sentDf, by = "sentenceId")
      res %>% head() %>% knitr::kable()

      sentenceId    value       docId   sectionId   sentence
      1_0006_0012   0.0110008   1       1_0006
      1_0006_0025   0.0110065   1       1_0006
      1_0006_0027   0.0109621   1       1_0006
      1_0006_0028   0.0110488   1       1_0006
      1_0006_0033   0.0109785   1       1_0006
      1_0006_0040   0.0110083   1       1_0006      ──
  45. sentDf %>% filter(docId == "1") %>% group_by(sectionId) %>% count()

      # A tibble: 31 x 2
      # Groups: sectionId [31]
        sectionId     n
        <chr>     <int>
      1 1_0006      107
      2 1_0007       65
      3 1_0009      281
      4 1_0010      166
      5 1_0011      227
      6 1_0013      161

      res %>% filter(docId == "1") %>% group_by(sectionId) %>% count()

      # A tibble: 31 x 2
      # Groups: sectionId [31]
        sectionId     n
        <chr>     <int>
      1 1_0006       11
      2 1_0007        7
      3 1_0009       29
      4 1_0010       17
      5 1_0011       23
      6 1_0013       17
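The counts confirm that each section kept roughly 10% of its sentences (e.g. 11 of 107 for 1_0006). As a closing step, not shown on the slides, the selected sentences can be stitched back into one summary string per section:

    library(dplyr)

    summaries <- res %>%
      arrange(sectionId, sentenceId) %>%   # keep original reading order
      group_by(sectionId) %>%
      summarise(summary = paste(sentence, collapse = ""))  # Japanese needs no spaces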
