tests:
# Mention extraction cases. For each entry, `text` is the input and
# `expected` is the list of screen names (without the leading "@") that
# should be extracted, in order of appearance. An empty list means that
# nothing should be extracted.
mentions:
- description: "Extract mention at the beginning of a tweet"
  text: "@username reply"
  expected: ["username"]
- description: "Extract mention at the end of a tweet"
  text: "mention @username"
  expected: ["username"]
- description: "Extract mention in the middle of a tweet"
  text: "mention @username in the middle"
  expected: ["username"]
- description: "Extract mention of username with underscore"
  text: "mention @user_name"
  expected: ["user_name"]
- description: "Extract mention of all numeric username"
  text: "mention @12345"
  expected: ["12345"]
- description: "Extract mention of multiple usernames"
  text: "mention @username1 @username2"
  expected: ["username1", "username2"]
- description: "Extract mention in the middle of a Japanese tweet"
  text: "の@usernameに到着を待っている"
  expected: ["username"]
# "@_@" must be skipped; only the trailing "@username" is a mention.
- description: "DO NOT extract username ending in @"
  text: "Current Status: @_@ (cc: @username)"
  expected: ["username"]
- description: "DO NOT extract username followed by accented latin characters"
  text: "@aliceìnheiro something something"
  expected: []
# "@test@example.com" looks like an e-mail address and must be skipped.
- description: "Extract lone mention but not @user@user (too close to an email)"
  text: "@username email me @test@example.com"
  expected: ["username"]
# Mention extraction cases that also check positions. Each item under
# `expected` gives the extracted `screen_name` and its `indices` pair
# [start, end] within `text`. In the cases below, `start` is the offset of
# the "@" and `end` is one past the last character of the name; offsets count
# characters rather than bytes (the leading "の" occupies one position).
mentions_with_indices:
- description: "Extract a mention at the start"
  text: "@username yo!"
  expected:
  - screen_name: "username"
    indices: [0, 9]
# The bare word "username" at offset 0 is not a mention; only the second one is.
- description: "Extract a mention that has the same thing mentioned at the start"
  text: "username @username"
  expected:
  - screen_name: "username"
    indices: [9, 18]
- description: "Extract a mention in the middle of a Japanese tweet"
  text: "の@usernameに到着を待っている"
  expected:
  - screen_name: "username"
    indices: [1, 10]
# Reply extraction cases. `expected` is the single screen name the text is a
# reply to (without the leading "@"), or null when the text is not a reply.
# In the positive cases the "@name" is preceded by nothing or by whitespace
# only; any other leading character makes it a plain mention.
replies:
- description: "Extract reply at the beginning of a tweet"
  text: "@username reply"
  expected: "username"
- description: "Extract reply preceded by only a space"
  text: " @username reply"
  expected: "username"
# The full-width space is written as an escape so that it stays visible and
# cannot be confused with (or normalized into) the ASCII space case above.
- description: "Extract reply preceded by only a full-width space (U+3000)"
  text: "\u3000@username reply"
  expected: "username"
- description: "DO NOT Extract reply when preceded by text"
  text: "a @username mention, not a reply"
  expected: null
- description: "DO NOT Extract reply when preceded by ."
  text: ".@username mention, not a reply"
  expected: null
- description: "DO NOT Extract reply when preceded by /"
  text: "/@username mention, not a reply"
  expected: null
- description: "DO NOT Extract reply when preceded by _"
  text: "_@username mention, not a reply"
  expected: null
- description: "DO NOT Extract reply when preceded by -"
  text: "-@username mention, not a reply"
  expected: null
- description: "DO NOT Extract reply when preceded by +"
  text: "+@username mention, not a reply"
  expected: null
- description: "DO NOT Extract reply when preceded by #"
  text: "#@username mention, not a reply"
  expected: null
- description: "DO NOT Extract reply when preceded by !"
  text: "!@username mention, not a reply"
  expected: null
- description: "DO NOT Extract reply when preceded by @"
  text: "@@username mention, not a reply"
  expected: null
# URL extraction cases. `expected` lists the URLs that should be extracted
# from `text`, in order of appearance; an empty list means that nothing
# should be extracted. An expected value may be shorter than the URL-like run
# in `text` when only a leading portion of it is valid.
urls:
- description: "Extract a lone URL"
  text: "http://example.com"
  expected: ["http://example.com"]
- description: "Extract valid URL: http://google.com"
  text: "text http://google.com"
  expected: ["http://google.com"]
- description: "Extract valid URL: http://foobar.com/#"
  text: "text http://foobar.com/#"
  expected: ["http://foobar.com/#"]
- description: "Extract valid URL: http://google.com/#foo"
  text: "text http://google.com/#foo"
  expected: ["http://google.com/#foo"]
- description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks"
  text: "text http://google.com/#search?q=iphone%20-filter%3Alinks"
  expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"]
- description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks"
  text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks"
  expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"]
- description: "Extract valid URL: http://somedomain.com/index.php?path=/abc/def/"
  text: "text http://somedomain.com/index.php?path=/abc/def/"
  expected: ["http://somedomain.com/index.php?path=/abc/def/"]
- description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
  text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"
  expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"]
- description: "Extract valid URL: http://somehost.com:3000"
  text: "text http://somehost.com:3000"
  expected: ["http://somehost.com:3000"]
- description: "Extract valid URL: http://xo.com/~matthew+%ff-x"
  text: "text http://xo.com/~matthew+%ff-x"
  expected: ["http://xo.com/~matthew+%ff-x"]
- description: "Extract valid URL: http://xo.com/~matthew+%ff-,.;x"
  text: "text http://xo.com/~matthew+%ff-,.;x"
  expected: ["http://xo.com/~matthew+%ff-,.;x"]
- description: "Extract valid URL: http://xo.com/,.;x"
  text: "text http://xo.com/,.;x"
  expected: ["http://xo.com/,.;x"]
- description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)"
  text: "text http://en.wikipedia.org/wiki/Primer_(film)"
  expected: ["http://en.wikipedia.org/wiki/Primer_(film)"]
- description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59"
  text: "text http://www.ams.org/bookstore-getitem/item=mbk-59"
  expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"]
- description: "Extract valid URL: http://✪df.ws/ejp"
  text: "text http://✪df.ws/ejp"
  expected: ["http://✪df.ws/ejp"]
- description: "Extract valid URL: http://chilp.it/?77e8fd"
  text: "text http://chilp.it/?77e8fd"
  expected: ["http://chilp.it/?77e8fd"]
- description: "Extract valid URL: http://x.com/oneletterdomain"
  text: "text http://x.com/oneletterdomain"
  expected: ["http://x.com/oneletterdomain"]
- description: "DO NOT extract invalid URL: http://domain-begin_dash_2314352345_dfasd.foo-cow_4352.com"
  text: "text http://domain-dash_2314352345_dfasd.foo-cow_4352.com"
  expected: []
- description: "DO NOT extract invalid URL: http://-begin_dash_2314352345_dfasd.foo-cow_4352.com"
  text: "text http://-dash_2314352345_dfasd.foo-cow_4352.com"
  expected: []
- description: "DO NOT extract invalid URL: http://no-tld"
  text: "text http://no-tld"
  expected: []
- description: "DO NOT extract invalid URL: http://tld-too-short.x"
  text: "text http://tld-too-short.x"
  expected: []
- description: "Extract a very long hyphenated sub-domain URL (single letter hyphens)"
  text: "text http://word-and-a-number-8-ftw.domain.tld/"
  expected: ["http://word-and-a-number-8-ftw.domain.tld/"]
# Only the part up to the TLD is extracted; the hyphenated tail is dropped.
- description: "Extract a hyphenated TLD (usually a typo)"
  text: "text http://domain.tld-that-you-should-have-put-a-space-after"
  expected: ["http://domain.tld"]
- description: "Extract URL ending with # value"
  text: "text http://foo.com?#foo text"
  expected: ["http://foo.com?#foo"]
- description: "SHOULD NOT Extract URLs without protocol on (com|org|edu|gov|net) domains"
  text: "foo.com foo.net foo.org foo.edu foo.gov"
  expected: []
- description: "DO NOT extract URLs without protocol not on (com|org|edu|gov|net) domains, even when preceded by www."
  text: "foo.bar foo.co.jp www.foo.bar www.foo.co.uk wwwww.foo foo.comm foo.somecom foo.govedu"
  expected: []
- description: "Extract URLs with a - or + at the end of the path"
  text: "Go to http://example.com/a+ or http://example.com/a-"
  expected: ["http://example.com/a+", "http://example.com/a-"]
- description: "Extract URLs with longer paths ending in -"
  text: "Go to http://example.com/view/slug-url-?foo=bar"
  expected: ["http://example.com/view/slug-url-?foo=bar"]
- description: "DO NOT extract URLs beginning with a space"
  text: "@user Try http:// example.com/path"
  expected: []
# The non-breaking space is written as an escape so that it stays visible and
# distinct from the ASCII space case above.
# NOTE(review): its position (right after "http://") is inferred from the
# parallel plain-space case - confirm against the upstream fixture.
- description: "DO NOT extract URLs beginning with a non-breaking space (U+00A0)"
  text: "@user Try http://\u00A0example.com/path"
  expected: []
- description: "Extract URLs with underscores and dashes in the subdomain"
  text: "test http://sub_domain-dash.twitter.com"
  expected: ["http://sub_domain-dash.twitter.com"]
- description: "Extract URL with minimum number of valid characters"
  text: "test http://a.b.cd"
  expected: ["http://a.b.cd"]
- description: "Extract URLs containing underscores and dashes"
  text: "test http://a_b.c-d.com"
  expected: ["http://a_b.c-d.com"]
- description: "Extract URLs containing dashes in the subdomain"
  text: "test http://a-b.c.com"
  expected: ["http://a-b.c.com"]
- description: "Extract URLs with dashes in the domain name"
  text: "test http://twitter-dash.com"
  expected: ["http://twitter-dash.com"]
- description: "Extract URLs with lots of symbols then a period"
  text: "http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188"
  expected: ["http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188"]
- description: "DO NOT extract URLs containing leading dashes in the subdomain"
  text: "test http://-leadingdash.twitter.com"
  expected: []
- description: "DO NOT extract URLs containing trailing dashes in the subdomain"
  text: "test http://trailingdash-.twitter.com"
  expected: []
- description: "DO NOT extract URLs containing leading underscores in the subdomain"
  text: "test http://_leadingunderscore.twitter.com"
  expected: []
- description: "DO NOT extract URLs containing trailing underscores in the subdomain"
  text: "test http://trailingunderscore_.twitter.com"
  expected: []
- description: "DO NOT extract URLs containing leading dashes in the domain name"
  text: "test http://-twitter.com"
  expected: []
- description: "DO NOT extract URLs containing trailing dashes in the domain name"
  text: "test http://twitter-.com"
  expected: []
- description: "DO NOT extract URLs containing underscores in the domain name"
  text: "test http://twitter_underscore.com"
  expected: []
- description: "DO NOT extract URLs containing underscores in the tld"
  text: "test http://twitter.c_o_m"
  expected: []
# URL extraction cases that also check positions. Each item under `expected`
# gives the extracted `url` and its `indices` pair [start, end] within `text`.
# In the cases below, `end` is one past the last character of the URL, and
# offsets count characters rather than bytes (each Japanese character before
# the URL occupies one position).
urls_with_indices:
- description: "Extract a URL"
  text: "text http://google.com"
  expected:
  - url: "http://google.com"
    indices: [5, 22]
- description: "Extract a URL from a Japanese tweet"
  text: "皆さん見てください! http://google.com"
  expected:
  - url: "http://google.com"
    indices: [11, 28]
# Hashtag extraction cases. `expected` lists the hashtags that should be
# extracted from `text`, without the leading "#", in order of appearance; an
# empty list means that nothing should be extracted.
hashtags:
- description: "Extract an all-alpha hashtag"
  text: "a #hashtag here"
  expected: ["hashtag"]
- description: "Extract a letter-then-number hashtag"
  text: "this is #hashtag1"
  expected: ["hashtag1"]
- description: "Extract a number-then-letter hashtag"
  text: "#1hashtag is this"
  expected: ["1hashtag"]
- description: "DO NOT Extract an all-numeric hashtag"
  text: "On the #16 bus"
  expected: []
- description: "Extract a hashtag containing ñ"
  text: "I'll write more tests #mañana"
  expected: ["mañana"]
- description: "Extract a hashtag containing é"
  text: "Working remotely #café"
  expected: ["café"]
- description: "Extract a hashtag containing ü"
  text: "Getting my Oktoberfest on #münchen"
  expected: ["münchen"]
# NOTE(review): the "#" below is followed by whitespace, which is presumably
# why nothing is extracted even though later cases do extract Japanese
# hashtags - confirm that the description still matches the intent.
- description: "DO NOT Extract a hashtag containing Japanese"
  text: "this is not valid: # 会議中 ハッシュ"
  expected: []
- description: "Extract a hashtag in Korean"
  text: "What is #트위터 anyway?"
  expected: ["트위터"]
- description: "Extract a hashtag in Russian"
  text: "What is #ашок anyway?"
  expected: ["ашок"]
- description: "Extract a starting katakana hashtag"
  text: "#カタカナ is a hashtag"
  expected: ["カタカナ"]
- description: "Extract a starting hiragana hashtag"
  text: "#ひらがな FTW!"
  expected: ["ひらがな"]
- description: "Extract a starting kanji hashtag"
  text: "#漢字 is the future"
  expected: ["漢字"]
- description: "Extract a trailing katakana hashtag"
  text: "Hashtag #カタカナ"
  expected: ["カタカナ"]
- description: "Extract a trailing hiragana hashtag"
  text: "Japanese hashtags #ひらがな"
  expected: ["ひらがな"]
- description: "Extract a trailing kanji hashtag"
  text: "Study time #漢字"
  expected: ["漢字"]
- description: "Extract a central katakana hashtag"
  text: "See my #カタカナ hashtag?"
  expected: ["カタカナ"]
- description: "Extract a central hiragana hashtag"
  text: "Study #ひらがな for fun and profit"
  expected: ["ひらがな"]
- description: "Extract a central kanji hashtag"
  text: "Some say #漢字 is the past. what do they know?"
  expected: ["漢字"]
- description: "Extract a Kanji/Katakana mixed hashtag"
  text: "日本語ハッシュタグテスト #日本語ハッシュタグ"
  expected: ["日本語ハッシュタグ"]
- description: "Extract a hashtag after a punctuation"
  text: "日本語ハッシュテスト。#日本語ハッシュタグ"
  expected: ["日本語ハッシュタグ"]
- description: "DO NOT include a punctuation in a hashtag"
  text: "#日本語ハッシュタグ。"
  expected: ["日本語ハッシュタグ"]
# NOTE(review): the description says "full-width", but the hashtag below is
# plain ASCII - the full-width characters may have been lost to Unicode
# normalization; confirm against the upstream fixture.
- description: "Extract a full-width Alnum hashtag"
  text: "全角英数字ハッシュタグ #hashtag123"
  expected: ["hashtag123"]
- description: "DO NOT extract a hashtag without a preceding space"
  text: "日本語ハッシュタグ#日本語ハッシュタグ"
  expected: []
# Hashtag extraction cases that also check positions. Each item under
# `expected` gives the extracted `hashtag` (without the leading "#") and its
# `indices` pair [start, end] within `text`. In the cases below, `start` is
# the offset of the "#" and `end` is one past the last character of the tag;
# offsets count characters rather than bytes.
hashtags_with_indices:
- description: "Extract a hashtag at the start"
  text: "#hashtag here"
  expected:
  - hashtag: "hashtag"
    indices: [0, 8]
- description: "Extract a hashtag at the end"
  text: "test a #hashtag"
  expected:
  - hashtag: "hashtag"
    indices: [7, 15]
- description: "Extract a hashtag in the middle"
  text: "test a #hashtag in a string"
  expected:
  - hashtag: "hashtag"
    indices: [7, 15]
# The all-numeric "#123" is skipped; only "#hashtag" is reported.
- description: "Extract only a valid hashtag"
  text: "#123 a #hashtag in a string"
  expected:
  - hashtag: "hashtag"
    indices: [7, 15]
- description: "Extract a hashtag in a string of multi-byte characters"
  text: "会議中 #hashtag 会議中"
  expected:
  - hashtag: "hashtag"
    indices: [4, 12]