3章 その4 単語頻出表の読み込み

pythonのstring.stripは文字列の前後から指定した文字を取り除く関数で、引数無しだと文字列前後の空白を除去する。rubyにも引数なしのstripがあって、同じ働きをする



p.37より、clusters.rbを作成して、前回generatefeedvector.rbで作成した単語頻出表(myblogdata.txt)から単語名などを取り出し、ruby上のオブジェクトにする。


http://www.bitbucket.org/shokai/collective-intelligence-study/src/82884830d372/03/clusters.rb

#!/usr/bin/ruby

require 'rubygems'
require 'pp'

class Clusters

  # Reads a tab-separated word-count table and splits it into labels and data.
  #
  # The first line is the header: its first cell is discarded and the
  # remaining cells are the column names (words). On every following line the
  # first cell is the row name (blog name) and the remaining cells are the
  # counts, converted with String#to_i.
  #
  # filename - path of the tab-separated file to read.
  #
  # Returns [rownames, colnames, data], where data[i] is the Array of Integer
  # counts belonging to rownames[i]. An empty file yields three empty arrays,
  # and blank lines after the header are ignored.
  def readline(filename)
    # File.open with a block closes the handle when the block exits, and it
    # always treats its argument as a path (Kernel#open can run a command
    # when the name starts with "|").
    header, *rows = File.open(filename) { |file| file.readlines }
    return [], [], [] if header.nil?

    # The first line holds the column titles (the words).
    colnames = header.strip.split("\t")
    colnames.shift # discard the title of the row-name column

    # Blog names and their word counts.
    rownames = []
    data = []
    rows.each do |line|
      cells = line.strip.split("\t")
      # A blank line (for example a trailing newline) carries no row.
      next if cells.empty?

      # The first cell of each row is the row name (the blog name)...
      rownames << cells.shift
      # ...and the remaining cells are that row's counts.
      data << cells.map { |cell| cell.to_i }
    end

    [rownames, colnames, data]
  end

end


使ってみる。blog名リスト

# Load the word-count table (myblogdata.txt) into Ruby arrays.
cs = Clusters.new
rownames,colnames,data = cs.readline('myblogdata.txt')
# Print the row labels (blog names).
p rownames
["Micro Persuasion", "Creating Passionate Users", "Kotaku", "GoFugYourself", "CoolerHeads Prevail", "The Superficial - Because You're Ugly", "ongoing", "SpikedHumor - Today's Videos and Pictures", "Deadspin", "Signal vs. Noise", "The Blotter", "Online Marketing Report", "SimpleBits", "Quick Online Tips", "Celebrity gossip juicy celebrity rumors Hollywood gossip blog from Perez Hilton", "Joel on Software", "NewsBusters.org - Exposing Liberal Media Bias", "Pharyngula", "Bloggers Blog: Blogging the Blogsphere", "Publishing 2.0", "How to Change the World", "Joystiq", "Valleywag", "Engadget", "The Daily Dish | By Andrew Sullivan", "Lifehacker", "Stepcase Lifehack", "Copyblogger", "Techdirt", "Search Engine Roundtable", "Power Line", "Scobleizer -- Tech geek blogger", "Joho the Blog", "Joi Ito's Web", "The Official Google Blog", "we make money not art", "Shoemoney - Skills To Pay The Bills", "ProBlogger Blog Tips", "Oilman", "Hot Air \302\273 Top Picks", "kottke.org", "Google Operating System", "Boing Boing", "Google Blogoscoped", "John Battelle's Searchblog", "Search Engine Watch Blog", "Eschaton", "Think Progress", "Daily Kos", "blog maverick", "Wired Top Stories", "Mashable!", "The Viral Garden", "Download Squad", "Gothamist", "Derek Powazek", "plasticbag.org", "Gizmodo", "Crooks and Liars", "TMZ.com", "A Consuming Experience (full feed)", "Talking Points Memo", "ScienceBlogs : Combined Feed", "MetaFilter", "Autoblog", "BuzzMachine", "WWdN: In Exile", "PaulStamatiou.com", "Cool Hunting", "Wonkette: The D.C. 
Gossip", "TechCrunch", "Sifry's Alerts", "Dave Shea's mezzoblue", "Little Green Footballs", "Giga Omni Media, Inc.", "Slashdot", "Topix.net Weblog", "Steve Pavlina's Personal Development Blog", "Gawker", "456 Berea Street", "Michelle Malkin", "The Unofficial Apple Weblog (TUAW)", "Schneier on Security", "ReadWriteWeb", "Neil Gaiman's Journal", "flagrantdisregard", "Jeremy Zawodny's blog", "Instapundit.com (v.2)", "Bloglines | News", "The Full Feed from HuffingtonPost.com", "TreeHugger", "43 Folders", "Matt Cutts: Gadgets, Google, and SEO", "MAKE Magazine", "Seth's Blog", "gapingvoid: "cartoons drawn on the back of business cards"", "Captain's Quarters"]


単語リスト

# Print the column labels (words).
p colnames
["designed", "", "pretty", "approach", "folks", "perfect", "below", "appear", "plan", "across", "jpg", "play", "creative", "developers", "brings", "starts", "brought", "absolutely", "tried", "book", "success", "second", "similar", "costs", "street", "together", "rules", "anyone", "quite", "publishing", "working", "b", "f", "g", "h", "j", "k", "l", "direct", "price", "n", "o", "almost", "q", "r", "u", "v", "w", "x", "y", "z", "feedburner", "companies", "actual", "starting", "field", "next", "icon", "traditional", "behind", "tips", "maps", "isn", "lines", "middle", "wall", "solid", "presidential", "check", "mark", "wants", "gif", "blogspot", "goes", "writing", "green", "google", "needs", "scale", "biden", "bloggers", "however", "economy", "means", "links", "htm", "election", "super", "god", "expected", "bunch", "credit", "gov", "including", "small", "said", "course", "decided", "response", "plus", "hours", "dark", "runs", "place", "nbsp", "without", "code", "data", "date", "history", "reasons", "file", "police", "class", "speaking", "friday", "apparently", "audience", "visit", "guy", "false", "feeds", "fine", "smart", "living", "float", "games", "little", "standard", "october", "looks", "star", "weeks", "car", "stay", "london", "despite", "fire", "present", "problems", "lots", "friend", "tell", "allowscriptaccess", "tells", "details", "love", "attention", "culture", "crisis", "brand", "party", "each", "thinking", "step", "ceo", "pheedo", "five", "block", "home", "vote", "reading", "turns", "running", "created", "copy", "children", "term", "process", "technology", "amazon", "says", "seems", "finally", "possible", "control", "align", "blogs", "computer", "among", "sure", "ever", "especially", "eye", "series", "category", "host", "polls", "color", "text", "particular", "extra", "stop", "hour", "act", "office", "question", "customer", "keep", "speech", "calls", "move", "quality", "clip", "age", "object", "answer", "ago", "create", "dollars", "press", "education", 
"specific", "inside", "felt", "audio", "energy", "cut", "air", "easy", "moment", "track", "nytimes", "articles", "season", "outside", "life", "platform", "helps", "knowledge", "via", "public", "wireless", "minute", "church", "candidates", "taken", "several", "picture", "places", "span", "amp", "takes", "provides", "messages", "research", "changes", "larger", "talk", "points", "role", "does", "sunday", "hands", "wordpress", "light", "happy", "rather", "tag", "whole", "within", "build", "ask", "built", "started", "making", "line", "link", "offer", "questions", "explain", "jump", "legal", "ten", "happened", "future", "believe", "able", "readers", "wrong", "majority", "single", "high", "entertainment", "release", "gadgets", "percent", "president", "live", "best", "url", "social", "everyone", "sold", "washington", "human", "style", "she", "short", "client", "wrote", "chris", "blogger", "music", "tom", "top", "fall", "red", "around", "rel", "spend", "expect", "reviews", "mind", "wikipedia", "spent", "three", "adding", "having", "international", "fans", "obviously", "guide", "city", "global", "sort", "left", "anyway", "friends", "imagine", "early", "country", "board", "month", "reader", "software", "result", "easier", "record", "googleadservices", "pay", "under", "head", "pages", "strong", "hear", "community", "part", "permalink", "open", "per", "product", "advertising", "north", "study", "common", "rss", "stuff", "previous", "building", "auto", "feedproxy", "run", "watch", "system", "option", "lets", "local", "review", "area", "related", "features", "aren", "multiple", "different", "baby", "water", "profile", "devices", "wiki", "server", "border", "nice", "favorite", "doing", "matter", "once", "action", "looking", "myspace", "products", "writer", "words", "aspx", "thanks", "launch", "ones", "side", "old", "net", "leading", "months", "index", "wish", "consumer", "white", "makes", "during", "put", "limited", "embed", "applications", "bank", "ba", "bb", "bc", "camera", 
"bd", "bf", "updates", "took", "tool", "org", "network", "game", "br", "mentioned", "follow", "campaign", "pictures", "da", "db", "dc", "dd", "de", "either", "sarah", "df", "level", "interest", "added", "sometimes", "based", "law", "states", "marketing", "editor", "fa", "wonder", "fb", "total", "fc", "fd", "fe", "phone", "ff", "hspace", "non", "fm", "sources", "fs", "against", "playing", "position", "height", "static", "york", "hl", "hm", "spending", "let", "basic", "ht", "dead", "write", "family", "deal", "worth", "something", "voice", "else", "la", "message", "li", "table", "site", "lm", "late", "photo", "script", "himself", "involved", "economic", "content", "call", "least", "shot", "members", "deep", "show", "services", "reported", "pc", "near", "experience", "javascript", "mobile", "pm", "blockquote", "pr", "completely", "px", "whether", "black", "catch", "value", "sense", "books", "saying", "notice", "size", "lot", "low", "need", "quick", "sharing", "card", "td", "care", "th", "food", "knew", "tv", "looked", "provide", "idea", "woman", "vm", "views", "vs", "reason", "xb", "private", "hasn", "west", "instead", "difference", "appears", "break", "terms", "options", "simple", "minutes", "child", "women", "zm", "students", "simply", "flash", "four", "upon", "financial", "china", "focused", "author", "self", "moving", "sell", "security", "characters", "bold", "job", "joe", "whatever", "company", "james", "send", "numbers", "feature", "sent", "reality", "download", "longer", "library", "mail", "main", "issues", "html", "store", "giving", "story", "him", "thing", "posts", "hit", "archive", "topic", "must", "performance", "posted", "both", "order", "along", "recent", "haven", "wait", "special", "available", "later", "third", "above", "remember", "latest", "fan", "far", "probably", "sounds", "certainly", "customers", "itself", "param", "putting", "maybe", "hot", "likely", "growth", "various", "sales", "rate", "political", "widget", "between", "logo", "safe", "version", 
"called", "tools", "david", "amazing", "design", "wasn", "comment", "team", "fix", "kids", "movie", "funny", "unique", "gone", "pick", "tech", "traffic", "house", "center", "name", "nothing", "share", "entry", "groups", "sale", "large", "such", "million", "ways", "couple", "fox", "comes", "beautiful", "developer", "turned", "brain", "though", "directly", "phdo", "allow", "kind", "paper", "days", "lost", "end", "hundreds", "discussion", "understand", "personal", "stories", "charge", "did", "exactly", "reach", "fun", "radio", "republican", "launched", "original", "div", "number", "width", "yes", "come", "yet", "management", "showing", "weight", "save", "plans", "report", "cool", "nearly", "bad", "interview", "etc", "quickly", "worked", "bill", "weekly", "hope", "internet", "recently", "test", "fear", "support", "cost", "themselves", "gmail", "willing", "update", "missing", "war", "enough", "engine", "industry", "interface", "feed", "announced", "america", "feel", "cnn", "firefox", "trying", "web", "big", "tracker", "beyond", "development", "never", "bit", "add", "issue", "chicago", "road", "type", "ads", "thought", "interested", "released", "search", "computers", "powerful", "important", "releases", "government", "potential", "before", "rock", "couldn", "shockwave", "drop", "win", "drive", "telling", "higher", "others", "particularly", "box", "chance", "twitter", "buying", "player", "bottom", "taking", "alt", "learned", "choose", "padding", "things", "talks", "won", "service", "access", "assets", "watching", "vice", "change", "gallery", "iphone", "japanese", "buy", "official", "app", "palin", "piece", "article", "view", "further", "growing", "purchase", "sites", "away", "room", "barack", "example", "art", "program", "amount", "addition", "notes", "general", "voting", "usually", "everything", "point", "application", "according", "given", "using", "gives", "word", "face", "done", "san", "whose", "saw", "say", "business", "fact", "focus", "speed", "ability", "give", 
"info", "money", "list", "california", "talking", "average", "please", "michael", "morning", "popular", "competition", "beta", "set", "leaving", "none", "account", "serious", "written", "steve", "matters", "close", "turn", "lead", "individual", "school", "getting", "online", "six", "magazine", "billion", "page", "images", "soon", "front", "event", "choice", "yesterday", "apart", "events", "address", "paid", "continue", "note", "asking", "shows", "thread", "try", "final", "awesome", "trust", "might", "allowfullscreen", "actually", "obvious", "found", "problem", "fast", "learning", "science", "mccain", "forums", "edition", "offering", "rich", "certain", "conversation", "full", "facebook", "information", "happens", "famous", "federal", "offers", "title", "person", "pdf", "machines", "language", "following", "saturday", "swf", "ideas", "project", "businesses", "past", "again", "wide", "anything", "mean", "jobs", "seeing", "interesting", "space", "already", "ends", "paul", "market", "easily", "asked", "gets", "behavior", "consider", "american", "display", "php", "less", "wanted", "oct", "voters", "meet", "device", "didn", "testing", "click", "photos", "start", "since", "true", "state", "although", "files", "john", "thousands", "users", "feedflare", "someone", "rights", "august", "sound", "held", "comments", "free", "conference", "lives", "png", "perhaps", "help", "reports", "archives", "enjoy", "evidence", "pre", "microsoft", "risk", "pro", "needed", "mac", "mad", "power", "south", "man", "map", "used", "group", "told", "may", "become", "daily", "user", "uses", "website", "sign", "opportunity", "bush", "aa", "ab", "ac", "ad", "ae", "af", "cover", "touch", "al", "am", "unfortunately", "ap", "works", "networks", "includes", "men", "projects", "browser", "ca", "cb", "cc", "read", "cd", "ce", "cf", "forum", "vspace", "real", "image", "politics", "co", "forward", "wouldn", "mostly", "ea", "eb", "ec", "often", "ed", "ee", "ef", "yourself", "tough", "em", "en", "apple", 
"base", "earlier", "youtube", "named", "corporate", "register", "japan", "names", "coming", "id", "im", "parents", "worst", "yahoo", "machine", "include", "websites", "increasingly", "gave", "email", "clear", "night", "current", "learn", "university", "bring", "button", "mm", "mp", "apps", "mr", "margin", "candidate", "guess", "return", "target", "came", "oh", "source", "key", "ol", "strike", "videos", "blogging", "phones", "os", "currently", "major", "leave", "format", "half", "location", "national", "qm", "always", "newspaper", "heard", "november", "hand", "known", "knows", "times", "flickr", "mdash", "chief", "heart", "until", "myself", "went", "sports", "st", "windows", "former", "font", "happen", "results", "case", "cash", "uk", "ul", "um", "scene", "media", "rest", "huge", "seem", "seen", "hard", "doesn", "poll", "model", "creating", "obama", "form", "listen", "entire", "hate", "successful", "systems", "ym", "decision", "screen", "guys", "allows", "death", "selling", "digital", "attack", "cause", "itunes", "better", "contact", "connection", "weekend", "figure", "blank", "useful", "commercial", "race"]