Java: Matching URLs with Regex Wildcards
After someone suggested a way to match URLs and protocols with wildcards in LockCrypt, I started work on implementing a URL class that accepts wildcard (*) characters. The result is a class which takes a URL string in its constructor and breaks it apart into its component parts. The class is based on a JavaScript regex from Steve Levithan.
The full specification for the URL it constructs is protocol://user:password@host:port/direc/tory/file?query#ref. Any parts left blank are assumed to be wildcards. Performance averages out to about 0.05ms per check, not bad for a beefy regex.
Just the constructor is listed here, the full source is available at http://leghumped.com/WildcardURL.java.
/** Selects which pattern the constructor parses with; the loose pattern is used when false. */
private static final boolean STRICT_MODE = false;

/*
 * Both patterns expose the same 13 capture groups, in this order:
 *   1 protocol, 2 authority, 3 userInfo, 4 user, 5 password, 6 host, 7 port,
 *   8 relative, 9 path, 10 directory, 11 file, 12 query, 13 ref
 * (group 0 is the whole match). They are compiled once here rather than on
 * every construction, since Pattern.compile is the expensive part of a check.
 */
private static final Pattern STRICT_URL_PATTERN = Pattern.compile(
        "^(?:([^:/?#]+):)?(?://((?:(([^:@]*):?([^:@]*))?@)?([^:/?#]*)(?::(\\d*))?))?((((?:[^?#/]*/)*)([^?#]*))(?:\\?([^#]*))?(?:#(.*))?)");

private static final Pattern LOOSE_URL_PATTERN = Pattern.compile(
        "^(?:(?![^:@]+:[^:@/]*@)([^:/?#.]+):)?(?://)?((?:(([^:@]*):?([^:@]*))?@)?([^:/?#]*)(?::(\\d*))?)(((/(?:[^?#](?![^?#/]*\\.[^?#/.]+(?:[?#]|$)))*/?)?([^?#/]*))(?:\\?([^#]*))?(?:#(.*))?)");

/**
 * Parses a URL specification that may contain wildcard (*) characters and
 * stores its component parts in this object's fields.
 *
 * Every component whose capture group did not take part in the match is
 * stored as "*". A group that matched the empty string is stored as "" and
 * is not converted to "*".
 *
 * The port field is only assigned when the port component is a valid int;
 * otherwise it keeps whatever value it had before this constructor ran.
 *
 * @param url the URL specification to parse, of the general form
 *            protocol://user:password@host:port/direc/tory/file?query#ref
 * @throws NullPointerException if url is null
 */
public WildcardURL(String url) {
    Pattern pattern = STRICT_MODE ? STRICT_URL_PATTERN : LOOSE_URL_PATTERN;
    Matcher matcher = pattern.matcher(url);
    // Both patterns can match the empty string, so find() is expected to
    // succeed for any input; the flag is purely defensive.
    boolean matched = matcher.find();

    this.protocol = groupOrWildcard(matcher, matched, 1);
    this.user = groupOrWildcard(matcher, matched, 4);
    this.password = groupOrWildcard(matcher, matched, 5);
    this.host = groupOrWildcard(matcher, matched, 6);
    this.directory = groupOrWildcard(matcher, matched, 10);
    this.file = groupOrWildcard(matcher, matched, 11);
    this.query = groupOrWildcard(matcher, matched, 12);
    this.ref = groupOrWildcard(matcher, matched, 13);

    String portText = groupOrWildcard(matcher, matched, 7);
    try {
        this.port = Integer.parseInt(portText);
    } catch (NumberFormatException ignored) {
        // No usable port ("*", empty, or out of int range): leave the field untouched.
    }
}

/** Returns the text of the given capture group, or "*" when the group (or the whole match) is absent. */
private static String groupOrWildcard(Matcher matcher, boolean matched, int group) {
    if (!matched) {
        return "*";
    }
    String value = matcher.group(group);
    return value == null ? "*" : value;
}
You can test it with the method below.
/**
 * Manual test driver for WildcardURL.
 *
 * First prints, for every current URL, whether each account URL pattern
 * matches it. Then repeats all the checks 2000 times and prints the total
 * number of checks, the elapsed time and the average time per check.
 */
public static void wildcardURLs() {
    String[] currentURLs = {
        "http://www.lime49.com/",
        "http://lime49.com/",
        "http://lime49.com/blah",
        "ftp://ftp.lime49.com/",
        "ftp://lime49.co.uk/",
        "http://user@www.lime49.com:81/search.php?q1=test1",
        "http://user:pwd@www.lime49.com:81/search/for/search.php?q1=0&&test1&test2=value#top"
    };
    String[] accountURLs = {
        "*://www.lime49.com",
        "*://*lime49.com",
        "*://*.lime49.com/blah",
        "http://www.lime49.com/blah",
        "http://*.lime49.com/blah",
        "http://*.com/blah",
        "http://*lime49*/blah",
        "http://www.lime49.com:81/search/for/search.php",
        "*://*",
        "lime49",
        "",
    };

    // Part 1: print the match table for every (current URL, account URL) pair.
    URL currentURL;
    for (String current : currentURLs) {
        try {
            System.out.println("Testing " + current);
            currentURL = new URL(current);
            System.out.println("Current URL\t\tAccount URL\t\tMatch");
            for (String accountURL : accountURLs) {
                System.out.print(current + "\t\t" + accountURL + "\t\t");
                try {
                    System.out.println(new WildcardURL(accountURL).matches(currentURL));
                } catch (Exception ex2) {
                    System.out.println("Error parsing:" + accountURL);
                }
            }
        } catch (Exception ex) {
            System.out.println("Error parsing:" + current);
        }
        System.out.println("\n");
    }

    // Part 2: time the same checks, including construction of each WildcardURL.
    final int iterations = 2000;
    long start = System.currentTimeMillis();
    for (int i = 0; i < iterations; i++) {
        for (String current : currentURLs) {
            try {
                currentURL = new URL(current);
                for (String accountURL : accountURLs) {
                    try {
                        // Result is deliberately discarded; only the cost of the check matters here.
                        new WildcardURL(accountURL).matches(currentURL);
                    } catch (Exception ignored) {
                        // Failures were already reported in part 1; keep timing the rest.
                    }
                }
            } catch (Exception ignored) {
                // Failures were already reported in part 1; keep timing the rest.
            }
        }
    }
    long millis = System.currentTimeMillis() - start;

    int numChecks = iterations * currentURLs.length * accountURLs.length;
    // RoundingMode.HALF_UP replaces the deprecated BigDecimal.ROUND_HALF_UP int constant.
    java.math.BigDecimal average = new java.math.BigDecimal(millis / (double) numChecks)
            .setScale(3, java.math.RoundingMode.HALF_UP);
    System.out.println("Completed " + numChecks + " match checks in " + millis
            + "ms - Avg = " + average + "ms/check");
}


Comments
Leave a Comment