Skip to content

[Python 代码块] 从 URL 中提取域名

python
from urllib.parse import urlparse

# Two-label suffixes under ".cn" (generic categories and two-letter regional
# codes). A host ending in one of these needs three labels, not two, to form
# its registrable domain. Kept as a frozenset so it is built once and lookups
# are O(1).
_CN_SECOND_LEVEL_SUFFIXES = frozenset([
    'com.cn', 'net.cn', 'org.cn', 'gov.cn', 'edu.cn', 'ac.cn',
    'bj.cn', 'fj.cn', 'sh.cn', 'jx.cn', 'tj.cn', 'sd.cn', 'cq.cn',
    'ha.cn', 'he.cn', 'hb.cn', 'sx.cn', 'hn.cn', 'nm.cn', 'gd.cn',
    'ln.cn', 'gx.cn', 'jl.cn', 'hi.cn', 'hl.cn', 'sc.cn', 'js.cn',
    'gz.cn', 'zj.cn', 'yn.cn', 'ah.cn', 'xz.cn', 'sn.cn', 'tw.cn',
    'gs.cn', 'hk.cn', 'qh.cn', 'mo.cn', 'nx.cn', 'xj.cn',
])


def parse_domain(url):
    """Return the registrable domain of *url*.

    The result is the last two labels of the host (``abc.com``), or the last
    three when the host ends in a known ``.cn`` second-level suffix
    (``abc.com.cn``). Hosts with one or two labels, and dotted IPv4
    addresses, are returned whole.

    Args:
        url: An absolute URL string such as ``https://www.abc.com/path``.

    Returns:
        The lower-cased domain without port or credentials, or an empty
        string when the URL has no network location (for example when the
        ``scheme://`` prefix is missing).
    """
    # ``hostname`` (unlike ``netloc``) drops "user:pass@" and ":port" and is
    # lower-cased; it is None when the URL carries no network location.
    # A trailing root dot ("abc.com.") would add an empty label, so strip it.
    host = (urlparse(url).hostname or '').rstrip('.')
    labels = host.split('.')
    if len(labels) <= 2:
        return host
    # A dotted IPv4 address has no registrable domain; keep it intact rather
    # than truncating it to its last two octets.
    if len(labels) == 4 and all(label.isdigit() for label in labels):
        return host
    keep = 3 if '.'.join(labels[-2:]) in _CN_SECOND_LEVEL_SUFFIXES else 2
    return '.'.join(labels[-keep:])


if __name__ == "__main__":
    # Demo run: sample URLs covering a bare domain, a ".com.cn" domain,
    # and hosts with one or two extra subdomain labels.
    sample_urls = [
        'https://abc.com/spreadsheet/ccc?key=blah-blah-blah-blah#gid=1',
        'https://abc.com.cn/spreadsheet/ccc?key=blah-blah-blah-blah#gid=1',
        'https://www.abc.com/spreadsheet/ccc?key=blah-blah-blah-blah#gid=1',
        'https://www.abc.com.cn/spreadsheet/ccc?key=blah-blah-blah-blah#gid=1',
        'https://aaa.bbb.abc.com/spreadsheet/ccc?key=blah-blah-blah-blah#gid=1',
    ]
    extracted = list(map(parse_domain, sample_urls))
    print(extracted)