获取URL的第二级域(java)
我想知道 Java 中是否有用于提取 URL 中二级域(SLD)的解析器或库;如果没有,是否有可用的算法或正则表达式。(标签:java、url)例如:
// Parse the sample URL; the URI(String) constructor throws URISyntaxException on malformed input.
URI uri = new URI("http://www.mydomain.ltd.uk/blah/some/page.html");
// getHost() yields the complete host component, including the leading "www." label.
String host = uri.getHost();
System.out.println(host);
它会打印:
www.mydomain.ltd.uk
现在,我想做的是可靠地识别出其中的 SLD(“ltd.uk”)部分。有什么想法吗?
编辑:我正在寻找一个理想的通用解决方案,所以我会在“police.uk”中匹配“.uk”,在“bbc.co.uk”中匹配“.co.uk”,在“amazon.com”中匹配“.com”
谢谢。
如果你想要第二级域名,可以按“.”拆分主机名字符串,并取最后两部分。当然,这是假设您总是有一个非特定于站点的二级域(因为这听起来像是您想要的)。
对于您的具体情况,我没有答案——Jonathan 的评论指出,您可能应该重新表述您的问题。
尽管如此,我还是建议看一下 Restlet 项目的 Reference 类。它有很多有用的方法。由于 Restlet 是开源的,所以您不必使用整个库——您可以下载源代码,只把这一个类添加到您的项目中。
不知道您的目的是什么,但二级域对您可能意义不大。您可能需要先找到公共后缀(public suffix),紧接在它下面的那一级域名才是您要找的。
Apache HTTP 组件(HttpClient 4)附带了处理此问题的类:
org.apache.http.impl.cookie.PublicSuffixFilter
org.apache.http.impl.cookie.PublicSuffixListParser
您需要从 publicsuffix.org 下载公共后缀列表。
/**
 * Answers two questions about a host name using a set of public-suffix rules:
 * whether the name is itself a public suffix ("TLD" in this class's vocabulary),
 * and what its registrable domain (public suffix plus one label) is.
 *
 * <p>Rules are supplied through {@link #setPublicSuffixes(Collection)} and
 * {@link #setExceptions(Collection)}. Wildcard rules are written {@code *.suffix}.
 * All matching is case-insensitive because host names are case-insensitive; rules
 * are stored lower-cased and lookups lower-case the queried name.
 */
public class TopLevelDomainChecker {
    private Set<String> exceptions;
    private Set<String> suffixes;

    /**
     * Replaces the public-suffix rules (plain rules such as {@code co.uk} and
     * wildcard rules such as {@code *.jp}).
     *
     * @param suffixes the rules to copy; must not be {@code null}
     * @throws NullPointerException if {@code suffixes} is {@code null}
     */
    public void setPublicSuffixes(Collection<String> suffixes) {
        this.suffixes = lowerCasedCopy(suffixes);
    }

    /**
     * Replaces the exception rules, i.e. names that are not public suffixes even
     * though a wildcard rule would match them. The leading {@code !} must already
     * have been removed.
     *
     * @param exceptions the exception rules to copy; must not be {@code null}
     * @throws NullPointerException if {@code exceptions} is {@code null}
     */
    public void setExceptions(Collection<String> exceptions) {
        this.exceptions = lowerCasedCopy(exceptions);
    }

    /**
     * Checks if the domain is a TLD (a public suffix according to the loaded rules).
     *
     * @param domain the host name to test; one leading dot is ignored and letter
     *               case is irrelevant; {@code null} is treated as "not a TLD"
     * @return {@code true} if the name equals a plain rule or matches a wildcard
     *         rule and is not listed as an exception; {@code false} otherwise,
     *         including when no rules have been set
     */
    public boolean isTLD(String domain) {
        if (domain == null) {
            return false;
        }
        String candidate = normalize(domain);

        // An exception rule takes priority over any other matching rule.
        // Exceptions are names that are not a TLD but would match a wildcard rule,
        // e.g. bl.uk is not a TLD although the rule *.uk says it is.
        if (isExceptionRule(candidate)) {
            return false;
        }
        if (this.suffixes == null) {
            return false;
        }
        if (this.suffixes.contains(candidate)) {
            return true;
        }

        // Try wildcard rules, i.e. *.jp means that boo.jp is a TLD.
        int nextDot = candidate.indexOf('.');
        if (nextDot == -1) {
            return false;
        }
        return this.suffixes.contains("*" + candidate.substring(nextDot));
    }

    /**
     * Extracts the registrable domain: the public suffix plus the one label to its left.
     *
     * <p>Labels are dropped from the left until the remainder is either an exception
     * rule (which is itself the registrable domain) or a public suffix (in which case
     * the previous, one-label-longer remainder is returned).
     *
     * @param domain the host name; one leading dot is ignored; may be {@code null}
     * @return the registrable domain in the caller's original letter case, or an
     *         empty string if {@code domain} is {@code null}, is itself a public
     *         suffix, or no rule matches
     */
    public String extractSLD(String domain) {
        if (domain == null) {
            return "";
        }
        String remaining = domain.startsWith(".") ? domain.substring(1) : domain;
        String previous = null; // remainder from the prior step, one label longer
        while (remaining.length() > 0) {
            // An exception rule names a registrable domain directly; without this check
            // the walk would continue past it and return the parent domain instead.
            if (isExceptionRule(remaining.toLowerCase(java.util.Locale.ROOT))) {
                return remaining;
            }
            if (isTLD(remaining)) {
                return previous == null ? "" : previous;
            }
            previous = remaining;
            int nextDot = remaining.indexOf('.');
            if (nextDot == -1) {
                return "";
            }
            remaining = remaining.substring(nextDot + 1);
        }
        return "";
    }

    /** Strips a single leading dot and lower-cases the name for rule lookup. */
    private static String normalize(String domain) {
        String stripped = domain.startsWith(".") ? domain.substring(1) : domain;
        return stripped.toLowerCase(java.util.Locale.ROOT);
    }

    /** Tells whether the already-normalized name is listed as an exception rule. */
    private boolean isExceptionRule(String normalizedDomain) {
        return this.exceptions != null && this.exceptions.contains(normalizedDomain);
    }

    /** Copies the rules into a new set, lower-casing each one and skipping null entries. */
    private static Set<String> lowerCasedCopy(Collection<String> rules) {
        Set<String> copy = new HashSet<String>();
        for (String rule : rules) {
            if (rule != null) {
                copy.add(rule.toLowerCase(java.util.Locale.ROOT));
            }
        }
        return copy;
    }
}
/**
 * Parses the list from <a href="http://publicsuffix.org/">publicsuffix.org</a> and
 * hands the resulting rules to a {@link TopLevelDomainChecker}.
 * Copied from http://svn.apache.org/repos/asf/httpcomponents/httpclient/trunk/httpclient/src/main/java/org/apache/http/impl/cookie/PublicSuffixListParser.java
 */
public class TopLevelDomainParser {
    /** Upper bound on the number of characters kept per line. */
    private static final int MAX_LINE_LEN = 256;

    private final TopLevelDomainChecker filter;

    /**
     * @param filter the checker that receives the parsed rules
     */
    TopLevelDomainParser(TopLevelDomainChecker filter) {
        this.filter = filter;
    }

    /**
     * Reads the whole list and installs its rules on the checker, replacing whatever
     * rules the checker held before. Empty lines and lines starting with {@code //}
     * are skipped; a leading dot is dropped; rules starting with {@code !} are
     * delivered as exceptions (without the {@code !}), everything else as suffixes.
     *
     * <p>The reader is wrapped in a {@link BufferedReader} but is not closed here;
     * closing it remains the caller's job.
     *
     * @param list source of the public suffix list
     * @throws IOException if reading fails or a line exceeds the length limit
     */
    public void parse(Reader list) throws IOException {
        Collection<String> rules = new ArrayList<String>();
        Collection<String> exceptions = new ArrayList<String>();
        BufferedReader r = new BufferedReader(list);
        StringBuilder sb = new StringBuilder(MAX_LINE_LEN);
        boolean more = true;
        while (more) {
            // readLine reports end of input, but the buffer may still hold a final
            // unterminated line, so it is processed before the loop condition is rechecked.
            more = readLine(r, sb);
            String line = sb.toString();
            if (line.length() == 0) {
                continue;
            }
            if (line.startsWith("//")) {
                continue; // entire lines can also be commented using //
            }
            if (line.startsWith(".")) {
                line = line.substring(1); // a leading dot is optional
            }
            // An exclamation mark (!) at the start of a rule marks an exception
            // to a previous wildcard rule.
            if (line.startsWith("!")) {
                exceptions.add(line.substring(1));
            } else {
                rules.add(line);
            }
        }
        filter.setPublicSuffixes(rules);
        filter.setExceptions(exceptions);
    }

    /**
     * Reads one line into {@code sb}, keeping only the characters before the first
     * whitespace character; the rest of the line is consumed and discarded.
     *
     * @param r  the source to read from
     * @param sb buffer that is cleared and then filled with the line's leading token
     * @return {@code false} once the end of the input has been reached, {@code true} otherwise
     * @throws IOException if reading fails or more than {@code MAX_LINE_LEN} characters are kept
     */
    private boolean readLine(Reader r, StringBuilder sb) throws IOException {
        sb.setLength(0);
        int b;
        boolean hitWhitespace = false;
        while ((b = r.read()) != -1) {
            char c = (char) b;
            if (c == '\n') {
                break;
            }
            // Each line is only read up to the first whitespace.
            if (Character.isWhitespace(c)) {
                hitWhitespace = true;
            }
            if (!hitWhitespace) {
                sb.append(c);
            }
            if (sb.length() > MAX_LINE_LEN) {
                throw new IOException("Line too long"); // prevent excess memory usage
            }
        }
        return (b != -1);
    }
}
// Load the public suffix list into a checker. The reader is closed in a finally
// block so the file handle is released even if parsing throws.
TopLevelDomainChecker checker = new TopLevelDomainChecker();
TopLevelDomainParser parser = new TopLevelDomainParser(checker);
FileReader fr = new FileReader("effective_tld_names.dat.txt");
try {
    parser.parse(fr);
} finally {
    fr.close();
}

// The expected values below are the ones the original author recorded for the
// list version they had downloaded.
boolean result;
result = checker.isTLD("com"); // true
result = checker.isTLD("com.au"); // true
result = checker.isTLD("ltd.uk"); // true
result = checker.isTLD("google.com"); // false
result = checker.isTLD("google.com.au"); // false
result = checker.isTLD("metro.tokyo.jp"); // false

String sld;
sld = checker.extractSLD("com"); // ""
sld = checker.extractSLD("com.au"); // ""
sld = checker.extractSLD("google.com"); // "google.com"
sld = checker.extractSLD("google.com.au"); // "google.com.au"
sld = checker.extractSLD("www.google.com.au"); // "google.com.au"
sld = checker.extractSLD("www.google.com"); // "google.com"
sld = checker.extractSLD("foo.bar.hokkaido.jp"); // "foo.bar.hokkaido.jp"
sld = checker.extractSLD("moo.foo.bar.hokkaido.jp"); // "foo.bar.hokkaido.jp"
/**
 * Returns the labels of a host name that lie to the left of its public suffix.
 *
 * <p>The labels are selected by position rather than by value, so a private label
 * that happens to equal a suffix label (the first {@code com} in
 * {@code com.example.com}) is kept. Because the result is a set, repeated labels
 * collapse into one entry and label order is not preserved.
 *
 * @param uriHost a bare host name (not a full URI) accepted by
 *                {@code InternetDomainName.from}
 * @return the non-public labels; all labels when the name has no public suffix
 */
Set<String> nonePublicDomainParts(String uriHost) {
    InternetDomainName fullDomainName = InternetDomainName.from(uriHost);
    // publicSuffix() yields null for names that are not under any known public suffix.
    InternetDomainName publicDomainName = fullDomainName.publicSuffix();
    int publicPartCount = (publicDomainName == null) ? 0 : publicDomainName.parts().size();
    int privatePartCount = fullDomainName.parts().size() - publicPartCount;
    return new HashSet<String>(fullDomainName.parts().subList(0, privatePartCount));
}
<!-- Maven coordinates for Google Guava, pinned to 10.0.1; presumably the library that
     supplies InternetDomainName for the neighbouring snippets (confirm against their imports). -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>10.0.1</version>
<scope>compile</scope>
</dependency>
/**
 * Returns the top private domain of a host name as a dot-separated string, built by
 * joining the labels reported by {@code InternetDomainName.topPrivateDomain()}.
 *
 * @param uri the host name to analyse; despite the parameter name it is passed
 *            straight to {@code InternetDomainName.from}, so it should be a bare
 *            host rather than a complete URI
 * @return the labels of the top private domain joined with {@code '.'}
 */
public static String getTopLevelDomain(String uri) {
    InternetDomainName privateDomain = InternetDomainName.from(uri).topPrivateDomain();
    StringBuilder joined = new StringBuilder();
    for (String label : privateDomain.parts()) {
        // Put a separator in front of every label except the first.
        if (joined.length() > 0) {
            joined.append(".");
        }
        joined.append(label);
    }
    return joined.toString();
}