Nov
3
2008

Java: Matching URLs with Regex Wildcards



Comments available as RSS 2.0

After someone suggested a way to match URLs and protocols with wildcards in LockCrypt, I started work on implementing a URL class which accepts wildcard (*) characters. The result is a class which takes a URL string as a constructor argument and breaks it apart into its component parts. The class is based on a JavaScript regex from Steve Levithan.

The full specification for the URL it constructs is protocol://user:password@host:port/direc/tory/file?query#ref. Any parts left blank are assumed to be wildcards. Performance averages out to about 0.05ms per check, not bad for a beefy regex.

Just the constructor is listed here, the full source is available at http://leghumped.com/WildcardURL.java.

/** Selects the strict pattern when true; the lenient pattern is used otherwise. */
private static final boolean STRICT_MODE = false;

/** Strict URL regex; capture groups are numbered identically to {@link #LOOSE_URL_REGEX}. */
private static final String STRICT_URL_REGEX = "^(?:([^:/?#]+):)?(?://((?:(([^:@]*):?([^:@]*))?@)?([^:/?#]*)(?::(\\d*))?))?((((?:[^?#/]*/)*)([^?#]*))(?:\\?([^#]*))?(?:#(.*))?)";

/**
 * Lenient URL regex. Capture groups, in order:
 * 0 source, 1 protocol, 2 authority, 3 userInfo, 4 user, 5 password, 6 host,
 * 7 port, 8 relative, 9 path, 10 directory, 11 file, 12 query, 13 ref.
 */
private static final String LOOSE_URL_REGEX = "^(?:(?![^:@]+:[^:@/]*@)([^:/?#.]+):)?(?://)?((?:(([^:@]*):?([^:@]*))?@)?([^:/?#]*)(?::(\\d*))?)(((/(?:[^?#](?![^?#/]*\\.[^?#/.]+(?:[?#]|$)))*/?)?([^?#/]*))(?:\\?([^#]*))?(?:#(.*))?)";

/** Compiled once and shared by all instances instead of being recompiled per construction. */
private static final Pattern URL_PATTERN = Pattern.compile(STRICT_MODE ? STRICT_URL_REGEX : LOOSE_URL_REGEX);

/** Number of parts captured by the pattern, counting group 0 (the whole match). */
private static final int PART_COUNT = 14;

private static final int GROUP_PROTOCOL = 1;
private static final int GROUP_USER = 4;
private static final int GROUP_PASSWORD = 5;
private static final int GROUP_HOST = 6;
private static final int GROUP_PORT = 7;
private static final int GROUP_DIRECTORY = 10;
private static final int GROUP_FILE = 11;
private static final int GROUP_QUERY = 12;
private static final int GROUP_REF = 13;

/**
 * Splits a URL string (which may contain {@code *} wildcards) into its components.
 *
 * <p>Any component whose capture group did not take part in the match is stored as
 * {@code "*"}. The port is only assigned when the captured text parses as an integer;
 * otherwise the field is left as it was.
 *
 * @param url the URL specification to parse
 */
public WildcardURL(String url) {
	// Entries stay null only if the pattern does not match at all.
	String[] parts = new String[PART_COUNT];
	Matcher matcher = URL_PATTERN.matcher(url);
	if(matcher.find()) {
		for(int i = 0; i < PART_COUNT; i++) {
			String match = matcher.group(i);
			// A group that did not participate in the match means "anything".
			parts[i] = match == null ? "*" : match;
		}
	}
	this.protocol = parts[GROUP_PROTOCOL];
	this.user = parts[GROUP_USER];
	this.password = parts[GROUP_PASSWORD];
	this.host = parts[GROUP_HOST];
	this.directory = parts[GROUP_DIRECTORY];
	this.file = parts[GROUP_FILE];
	this.query = parts[GROUP_QUERY];
	this.ref = parts[GROUP_REF];
	try {
		this.port = Integer.parseInt(parts[GROUP_PORT]);
	} catch(NumberFormatException ignored) {
		// Missing, wildcard ("*") or empty port: deliberately leave the port field untouched.
	}
}

You can test it with the method below.

public static void wildcardURLs() {
	String[] currentURLs = {
		"http://www.lime49.com/",
		"http://lime49.com/",
		"http://lime49.com/blah",
		"ftp://ftp.lime49.com/",
		"ftp://lime49.co.uk/",
		"http://user@www.lime49.com:81/search.php?q1=test1",
		"http://user:pwd@www.lime49.com:81/search/for/search.php?q1=0&&test1&test2=value#top"
	};
	String[] accountURLs = {
		"*://www.lime49.com",
		"*://*lime49.com",
		"*://*.lime49.com/blah",
		"http://www.lime49.com/blah",
		"http://*.lime49.com/blah",
		"http://*.com/blah",
		"http://*lime49*/blah",
		"http://www.lime49.com:81/search/for/search.php",
		"*://*",
		"lime49",
		"",
	};
	URL currentURL;
	for(int i=0;i<currentURLs.length;i++) {
		try {
			System.out.println("Testing "+currentURLs[i]);
			currentURL = new URL(currentURLs[i]);
			System.out.println("Current URL\t\tAccount URL\t\tMatch");
			for(String accountURL : accountURLs) {
				System.out.print(currentURLs[i]+"\t\t"+accountURL+"\t\t");
				try {
					System.out.println(new WildcardURL(accountURL).matches(currentURL));
				} catch(Exception ex2) {
					System.out.println("Error parsing:"+accountURL);
				}
			}
		} catch(Exception ex) {
			System.out.println("Error parsing:"+currentURLs[i]);
		}
		System.out.println("\n");
	}
 
	long start = System.currentTimeMillis();
	int i;
	for(i = 0;i<2000;i++) {
		for(int j=0;j<currentURLs.length;j++) {
			try {
				currentURL = new URL(currentURLs[j]);
				for(String accountURL : accountURLs) {
					try {
						if(new WildcardURL(accountURL).matches(currentURL)) {
 
						};
					} catch(Exception ex2) {}
				}
			} catch(Exception ex) {}
		}
	}
	long millis = System.currentTimeMillis()-start;
	int numChecks = (i*currentURLs.length*accountURLs.length);
	System.out.println("Completed "+numChecks+" match checks in "+millis+"ms - Avg = "+new java.math.BigDecimal((millis/(double)numChecks)).setScale(3,java.math.BigDecimal.ROUND_HALF_UP)+"ms/check");
}

Comments

Leave a Comment

Login using OpenID or enter your details below to leave a comment.

OpenID
Anonymous


Comment