If you would like to go the regex route...
RFC-3986 is the authority regarding URIs. Appendix B provides this regex to break one down into its components:
# Regex that splits a URI into its five generic components.
# Groups 1, 3, 6 and 8 wrap a component together with its delimiter
# (":", "//", "?", "#"); the inner groups listed below hold the bare values.
re_3986 = r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?"
# Where (capture group number -> URI component):
# scheme = $2
# authority = $4
# path = $5
# query = $7
# fragment = $9
Here is an enhanced, Python-friendly version which uses named capture groups. It is presented as a function within a working script:
import re
# Compiled once at import time instead of on every call.
# re.MULTILINE is kept on the URI pattern so that, as before, input containing
# a line break is parsed up to the end of its first line.
_RE_3986_ENHANCED = re.compile(r"""
    # Parse and capture RFC-3986 Generic URI components.
    ^                                     # anchor to beginning of string
    (?:  (?P<scheme>    [^:/?#\s]+): )?   # capture optional scheme
    (?://(?P<authority>  [^/?#\s]*)  )?   # capture optional authority
         (?P<path>        [^?#\s]*)       # capture required path
    (?:\?(?P<query>        [^#\s]*)  )?   # capture optional query
    (?:\#(?P<fragment>      [^\s]*)  )?   # capture optional fragment
    $                                     # anchor to end of string
    """, re.MULTILINE | re.VERBOSE)

_RE_DOMAIN = re.compile(r"""
    # Pick out top two levels of DNS domain from authority.
    (?P<domain>
        [^.@:]+           # second-level label; excluding "@" and ":" keeps
                          # userinfo ("user:pass@") out of the result
        \.
        [A-Za-z]{2,63}    # alphabetic TLD, up to the 63-character label limit
    )
    (?::[0-9]*)?          # optional port number
    $                     # anchor to end of string
    """, re.VERBOSE)


def get_domain(url):
    """Return the top two domain levels of the host in a URI.

    The URI is split into its generic components, then the last two
    dot-separated labels of the authority are extracted, ignoring any
    ``user:password@`` prefix and any ``:port`` suffix.

    Args:
        url: The URI string to inspect.

    Returns:
        A string such as ``"domainname.com"``, or ``""`` when the URI does
        not parse, has no authority, or its host does not end in an
        alphabetic top-level label (for example an IP address).

    Note:
        Only the last two labels are returned, so a host like
        ``www.example.co.uk`` yields ``"co.uk"``.
    """
    m_uri = _RE_3986_ENHANCED.match(url)
    if not m_uri:
        return ""

    authority = m_uri.group("authority")
    if not authority:
        # Relative references and scheme-only URIs carry no host.
        return ""

    m_domain = _RE_DOMAIN.search(authority)
    return m_domain.group("domain") if m_domain else ""
# Sample URIs exercising paths, ports, queries and fragments.
data_list = [
    r"http://abdd.eesfea.domainname.com/b/33tA$/0021/file",
    r"http://mail.domainname.org/abc/abc/aaa",
    r"http://domainname.edu",
    r"http://domainname.com:80",
    r"http://domainname.com?query=one",
    r"http://domainname.com#fragment",
]

# Print the extracted domain for each sample, numbered from 1.
for cnt, data in enumerate(data_list, 1):
    domain = get_domain(data)
    line = "Data[%d] domain = \"%s\"" % (cnt, domain)
    print(line)
For more information regarding the picking apart and validation of a URI according to RFC-3986, you may want to take a look at an article I've been working on: Regular Expression URI Validation
print '.%s' % ( '.'.join( "host.dom.com".split('.')[-2:] ) ) gives .dom.com (in all cases I tried).