# This test file is published under license  Apache License 2.0
# https://github.com/crawler-commons/crawler-commons/blob/master/src/test/resources/normalizer/weirdToNormalizedUrls.csv
# Version 94bac65 on Jan 4 2021

### XXX A few small changes (clearly marked) were needed to fix the input. XXX

# Weird URL, Normalized URL

# check that % encoding is normalized
http://foo.com/%66oo.html, http://foo.com/foo.html

# check that % encoding works correctly at end of URL
http://foo.com/%66oo.htm%6c, http://foo.com/foo.html
http://foo.com/%66oo.ht%6dl, http://foo.com/foo.html

# check that the % decoder does not overlap strings
http://foo.com/%66oo.ht%6d%6c, http://foo.com/foo.html

# check that % decoder leaves high bit chars alone
#XXX No: broken UTF8
#XXX http://foo.com/%66oo.htm%C0, http://foo.com/foo.htm%C0

# check that % decoder leaves control chars alone
http://foo.com/%66oo.htm%1A, http://foo.com/foo.htm%1A

# check that the % decoder converts the hex digits of retained escapes to upper case
#XXX http://foo.com/%66oo.htm%c0, http://foo.com/foo.htm%C0
http://foo.com/%66oo.htm%1a, http://foo.com/foo.htm%1A

# check that % decoder leaves encoded spaces alone
http://foo.com/you%20too.html, http://foo.com/you%20too.html

# check that spaces are encoded into %20
http://foo.com/you too.html, http://foo.com/you%20too.html

# check that encoded # are not decoded
http://foo.com/file.html%23cz, http://foo.com/file.html%23cz

# check that encoded / are not decoded
http://foo.com/fast/dir%2fcz, http://foo.com/fast/dir%2Fcz

# check that control chars are encoded
http://foo.com/!, http://foo.com/%1A!

# check that control chars are always encoded into 2 digits
http://foo.com/!, http://foo.com/%01!

# encoding of Spanish chars
http://mydomain.com/en Español.aspx, http://mydomain.com/en%20Espa%C3%B1ol.aspx


# Ampersand and colon and other punctuation characters are not to be unescaped
http://x.com/s?q=a%26b&m=10, http://x.com/s?q=a%26b&m=10
http://x.com/show?http%3A%2F%2Fx.com%2Fb, http://x.com/show?http%3A%2F%2Fx.com%2Fb
http://google.com/search?q=c%2B%2B, http://google.com/search?q=c%2B%2B

# also do not touch the query part, which is application/x-www-form-urlencoded
#XXX http://x.com/s?q=a+b, http://x.com/s?q=a+b
#XXX The source lib does not unify + and %20
http://x.com/s?q=a+b, http://x.com/s?q=a%20b

# convert Internationalized Domain Names (IDNs) from Unicode to Punycode #248
# (definitely do not apply percent-encoding: http://b%C3%BCcher.de/)
http://bücher.de/, http://xn--bcher-kva.de/
http://êxample.com, http://xn--xample-hva.com/
https://нэб.рф/, https://xn--90ax2c.xn--p1ai/

# unescape percent-encoded characters also in IDNs
https://www.0251-sachverst%c3%a4ndiger.de/, https://www.xn--0251-sachverstndiger-ozb.de/

# test whether percent-encoding works together with other normalizations
http://x.com/./a/../%66.html, http://x.com/f.html

# [ and ] need escaping as well
http://x.com/?x[y]=1, http://x.com/?x%5By%5D=1

# boundary test for first character outside the ASCII range (U+0080)
http://x.com/foo€, http://x.com/foo%C2%80
http://x.com/foo%c2%80, http://x.com/foo%C2%80


# testNormalizer
# --------------
# check that leading and trailing spaces are removed
http://foo.com/ , http://foo.com/

# check that protocol is lower cased
http://foo.com/, http://foo.com/

# check that host is lower cased
http://Foo.Com/index.html, http://foo.com/index.html
http://Foo.Com/index.html, http://foo.com/index.html

# unescape percent characters in host names
https://example%2Ecom/, https://example.com/

# check that port number is normalized
http://foo.com:80/index.html, http://foo.com/index.html
http://foo.com:81/, http://foo.com:81/

# check that empty port is removed
http://example.com:/, http://example.com/
https://example.com:/foobar.html, https://example.com/foobar.html

# check that null path is normalized
http://foo.com, http://foo.com/

# check that references are removed
http://foo.com/foo.html#ref, http://foo.com/foo.html

# check that encoding is normalized
http://foo.com/%66oo.html, http://foo.com/foo.html

# check that unnecessary ./ and ../ segments and repeated slashes are removed
http://foo.com/aa/./foo.html, http://foo.com/aa/foo.html
http://foo.com/aa/../, http://foo.com/
http://foo.com/aa/bb/../, http://foo.com/aa/
http://foo.com/aa/.., http://foo.com/
http://foo.com/aa/bb/cc/../../foo.html, http://foo.com/aa/foo.html
http://foo.com/aa/bb/../cc/dd/../ee/foo.html, http://foo.com/aa/cc/ee/foo.html
http://foo.com/../foo.html, http://foo.com/foo.html
http://foo.com/../../foo.html, http://foo.com/foo.html
http://foo.com/../aa/../foo.html, http://foo.com/foo.html
http://foo.com/aa/../../foo.html, http://foo.com/foo.html
http://foo.com/aa/../bb/../foo.html/../../, http://foo.com/
http://foo.com/../aa/foo.html, http://foo.com/aa/foo.html
http://foo.com/../aa/../foo.html, http://foo.com/foo.html
http://foo.com/a..a/foo.html, http://foo.com/a..a/foo.html
http://foo.com/a..a/../foo.html, http://foo.com/foo.html
http://foo.com/foo.foo/../foo.html, http://foo.com/foo.html
http://foo.com//aa/bb/foo.html, http://foo.com/aa/bb/foo.html
http://foo.com/aa//bb/foo.html, http://foo.com/aa/bb/foo.html
http://foo.com/aa/bb//foo.html, http://foo.com/aa/bb/foo.html
http://foo.com//aa//bb//foo.html, http://foo.com/aa/bb/foo.html
http://foo.com////aa////bb//foo.html, http://foo.com/aa/bb/foo.html
http://foo.com////aa////bb////foo.html, http://foo.com/aa/bb/foo.html

#XXX http://foo.com/aa?referer=http://bar.com, http://foo.com/aa?referer=http://bar.com
#XXX Browsers do not consider ':' and '/' in the query sensitive, and treat them as unequal to their hex-encoded versions,
#XXX but we follow the rules more strictly for better normalization.
http://foo.com/aa?referer=http://bar.com, http://foo.com/aa?referer=http%3A%2F%2Fbar.com

# also normalize  /..  (already in the root directory)
http://foo.com/.., http://foo.com/

# check URLs without host (authority)
file:///foo/bar.txt, file:///foo/bar.txt
ftp:/, ftp:/

#XXX next three are really broken
#XXX http:, http:/
#XXX http:////, http:/
http:////, http://localhost/
#XXX http:///////, http:/
http:///////, http://localhost/

# empty path with trailing question mark (empty query) #247
#XXX http://example.com?,http://example.com/?
http://example.com?,http://example.com/
http://example.com?a=1,http://example.com/?a=1

# normalizing percent escapes #263
#XXX Although this is not uncommon, still not supported because fragile
#XXX https://www.last.fm/music/Prefuse+73/_/90%+of+My+Mind+Is+With+You,https://www.last.fm/music/Prefuse+73/_/90%25+of+My+Mind+Is+With+You

# escape curly braces properly
http://foo.com/{{stuff}} , http://foo.com/%7B%7Bstuff%7D%7D

# special characters in path/query
#XXX "http://www.example.com/a/c/../b/search?q=foobar""", http://www.example.com/a/b/search?q=foobar%22
#XXX http://www.example.com/a/c/../b/search?q=foobar%, http://www.example.com/a/b/search?q=foobar%25
http://www.example.com/a/c/../b/search?q=foobar<, http://www.example.com/a/b/search?q=foobar%3C
http://www.example.com/a/c/../b/search?q=foobar>, http://www.example.com/a/b/search?q=foobar%3E
http://www.example.com/a/c/../b/search?q=foobar^, http://www.example.com/a/b/search?q=foobar%5E
http://www.example.com/a/c/../b/search?q=foobar`, http://www.example.com/a/b/search?q=foobar%60
http://www.example.com/a/c/../b/search?q=foobar|, http://www.example.com/a/b/search?q=foobar%7C

# escape a percent sign if it starts an invalid escape sequence
#XXX http://www.example.com/p%zz%77%v, http://www.example.com/p%25zzw%25v

# boundary test: percent sign close to the end of string
#XXX http://www.example.com/search?q=foobar%, http://www.example.com/search?q=foobar%25
#XXX http://www.example.com/search?q=foobar%2, http://www.example.com/search?q=foobar%252
http://www.example.com/search?q=foobar%25, http://www.example.com/search?q=foobar%25
http://www.example.com/search?q=foobar%252, http://www.example.com/search?q=foobar%252

# protocol to be lowercased
HTTP://foo.com/, http://foo.com/

# no protocol/scheme, see #271
foo.com/index.html, http://foo.com/index.html
# but do not break potentially valid URLs with less-used schemes
ftp://foo.com/index.html, ftp://foo.com/index.html
file:/path/index.html, file:/path/index.html

# removal of trailing dot in hostname
https://www.example.org./, https://www.example.org/

# file:/ URLs
file:/var/www/html/////./bar/index.html, file:/var/www/html/bar/index.html
file:/var/www/html/foo/../bar/index.html, file:/var/www/html/bar/index.html