Rectangle 27 2

The following code makes these modifications to the "Atwood Approach":

  • The CASE_INSENSITIVE flag is used since HtTpS:// is valid.
  • Matching sets of parentheses are peeled off (they can be nested to any level). Further, any remaining unmatched left parentheses are stripped, but trailing right parentheses are left intact (to respect wikipedia-style URLs)
  • The URL is HTML Encoded in the link text.
  • The target attribute is passed in via method parameter. Other attributes can be added as desired.
  • It does not use \b to identify a word break before matching a URL. URLs can begin with a left parenthesis or http[s]:// with no other requirement.
  • Apache Commons Lang's StringUtils are used in the code below
  • The call to HtmlUtil.encode() below is a util which ultimately calls some Tomahawk code to HTML-encode the link text, but any similar utility will do.
  • See the method comment for a usage in JSF or other environments where output is HTML Encoded by default.

This was written in response to our client's requirements and we feel it represents a reasonable compromise between the allowable characters from the RFC and common usage. It is offered here in the hopes that it will be useful to others.

Further expansion could be made to allow any Unicode characters to be entered directly (i.e. not escaped as %XX, two-digit hex) and still be hyperlinked, but that would require accepting all Unicode letters plus limited punctuation, then splitting on the "acceptable" delimiters (e.g. ., %, |, #, etc.), URL-encoding each part, and gluing the parts back together. For example, http://en.wikipedia.org/wiki/Björn_Andrésen (which the Stack Overflow generator does not detect) would be "http://en.wikipedia.org/wiki/Bj%C3%B6rn_Andr%C3%A9sen" in the href, but would contain Björn_Andrésen in the linked text on the page.

// Characters allowed anywhere in a URL, including as its final character.
// The string is spliced into a regex character class, therefore:
//   - the '-' comes first so that it is a literal hyphen, not a range operator;
//   - \w stands for 0-9, a-z, A-Z and '_'.
private static final String VALID_CHARS = "-\\w+&@#/%=~()|";

// Punctuation allowed inside a URL but not as its final character.
private static final String VALID_NON_TERMINAL = "?!:,.;";

// How the expression is assembled:
//   \(*        any number of leading '(' -- accepted here, dealt with by the
//              code that consumes the match
//   https?://  the scheme; the 's' is optional, so http and https both match
//   [...]*     zero or more characters, terminal or non-terminal
//   [...]      exactly one closing character from the terminal-capable set
// CASE_INSENSITIVE lets the scheme be written as hTtPs (for example).
private static final Pattern URI_FINDER_PATTERN = Pattern.compile(
        "\\(*"
        + "https?://"
        + "[" + VALID_CHARS + VALID_NON_TERMINAL + "]*"
        + "[" + VALID_CHARS + "]",
        Pattern.CASE_INSENSITIVE );

/**
 * Finds all "URL"s in the given _rawText, wraps them in HTML link tags and
 * returns the result, with the URLs and all of the surrounding text HTML-encoded.
 * <p>
 * We employ the procedure described at
 * http://www.codinghorror.com/blog/2008/10/the-problem-with-urls.html
 * which is a <b>must-read</b>.
 * </p>
 * <p>
 * Basically, we allow any number of left parentheses (which will get stripped away)
 * followed by http:// or https://, then any number of permitted URL characters
 * (based on http://www.ietf.org/rfc/rfc1738.txt) followed by a single character
 * of that set minus typical punctuation.  Leading left parentheses are never part
 * of the link.  A trailing right parenthesis is removed from the link only when it
 * pairs with one of those leading left parentheses; a right parenthesis that closes
 * a left parenthesis <i>inside</i> the URL (wikipedia-style, e.g.
 * <code>.../wiki/Foo_(bar)</code>) stays in the link, as does a trailing right
 * parenthesis with no leading partner.
 * </p>
 * <p>
 * This method <b>must</b> be called from a tag/component which will NOT
 * end up escaping the output.  For example:
 * </p>
 * <pre>
 * &lt;h:outputText ... escape="false" value="#{core:hyperlinkText(textThatMayHaveURLs, '_blank')}"/&gt;
 * </pre>
 * <p>
 * Reason: we are adding <code>&lt;a href="..."&gt;</code> tags to the output <i>and</i>
 * encoding the rest of the string.  Encoding the output again would double-encode
 * the data which was already encoded - and would encode the <code>a href</code>
 * itself (which would render it useless).
 * </p>
 *
 * @param   _rawText  the text to scan; if <code>null</code>, <code>""</code> (empty string) is returned.
 * @param   _target   if not <code>null</code> or blank, a <code>target</code> attribute is added to
 *                    every generated link, using the trimmed, HTML-encoded _target as its value.
 * @return  the HTML-encoded text with every detected URL wrapped in an <code>&lt;a&gt;</code> tag;
 *          never <code>null</code> for <code>null</code> input.
 */
public static final String hyperlinkText( final String _rawText, final String _target ) {

    // Honor the documented contract explicitly rather than depending on how
    // HtmlUtil.encode() happens to treat null.
    if ( _rawText == null ) {
        return "";
    }

    if ( StringUtils.isBlank( _rawText ) ) {
        return HtmlUtil.encode( _rawText );
    }

    final Matcher matcher = URI_FINDER_PATTERN.matcher( _rawText );

    if ( !matcher.find() ) {
        // No URL at all: just encode the text.
        return HtmlUtil.encode( _rawText );
    }

    // The target value is encoded so that it cannot terminate the attribute early.
    // NOTE(review): assumes HtmlUtil.encode() escapes double quotes, as HTML encoders
    // normally do -- confirm against the HtmlUtil implementation in use.
    final String targetText = StringUtils.isBlank( _target )
            ? ""
            : " target=\"" + HtmlUtil.encode( _target.trim() ) + "\"";

    // Counted 15 characters aside from the target + 2 of the URL (max if the whole string is URL)
    // Rough guess, but should keep us from expanding the Builder too many times.
    final StringBuilder returnBuffer =
            new StringBuilder( _rawText.length() * 2 + targetText.length() + 15 );

    int lastEnd = 0;

    do {
        final int matchStart = matcher.start();
        int urlEnd = matcher.end();

        // Leading '(' are never part of the URL; they are emitted as plain text.
        // The pattern guarantees the scheme follows, so this cannot run past the match.
        int urlStart = matchStart;
        while ( _rawText.charAt( urlStart ) == '(' ) {
            urlStart++;
        }

        // Peel one trailing ')' per leading '(' -- but only those ')' that do not
        // close a '(' inside the URL, so ".../wiki/Foo_(bar)" survives intact.
        final int leadingParens = urlStart - matchStart;
        urlEnd -= Math.min( leadingParens, countUnmatchedTrailingParens( _rawText, urlStart, urlEnd ) );

        // The URL may contain '&' (query strings), so it must be encoded both as
        // the attribute value and as the link text.
        final String encodedURL = HtmlUtil.encode( _rawText.substring( urlStart, urlEnd ) );

        // Text since last match
        returnBuffer.append( HtmlUtil.encode( _rawText.substring( lastEnd, urlStart ) ) );

        // Wrap matched URL
        returnBuffer.append( "<a href=\"" ).append( encodedURL ).append( '"' )
                    .append( targetText ).append( '>' )
                    .append( encodedURL ).append( "</a>" );

        lastEnd = urlEnd;

    } while ( matcher.find() );

    // Whatever follows the last URL.
    if ( lastEnd < _rawText.length() ) {
        returnBuffer.append( HtmlUtil.encode( _rawText.substring( lastEnd ) ) );
    }

    return returnBuffer.toString();

}

/**
 * Counts how many of the ')' characters ending the region [_start, _end) of _text
 * are not the closing partner of a '(' located earlier in that same region.
 *
 * @param   _text   the text containing the region.
 * @param   _start  index of the first character of the region (inclusive).
 * @param   _end    index just past the last character of the region (exclusive).
 * @return  the number of trailing ')' left without a partner inside the region; never negative.
 */
private static int countUnmatchedTrailingParens( final String _text, final int _start, final int _end ) {

    // Locate the run of ')' that ends the region.
    int trailingStart = _end;
    while ( trailingStart > _start && _text.charAt( trailingStart - 1 ) == ')' ) {
        trailingStart--;
    }

    // Nesting depth still open when that run begins; a stray ')' cannot push it below zero.
    int openDepth = 0;
    for ( int i = _start; i < trailingStart; i++ ) {
        final char c = _text.charAt( i );
        if ( c == '(' ) {
            openDepth++;
        } else if ( c == ')' && openDepth > 0 ) {
            openDepth--;
        }
    }

    // The first openDepth ')' of the run close parentheses inside the region.
    return Math.max( 0, ( _end - trailingStart ) - openDepth );
}

java - How to detect the presence of URL in a string - Stack Overflow

java url
Rectangle 27 2

The following code makes these modifications to the "Atwood Approach":

  • The CASE_INSENSITIVE flag is used since HtTpS:// is valid.
  • Matching sets of parentheses are peeled off (they can be nested to any level). Further, any remaining unmatched left parentheses are stripped, but trailing right parentheses are left intact (to respect wikipedia-style URLs)
  • The URL is HTML Encoded in the link text.
  • The target attribute is passed in via method parameter. Other attributes can be added as desired.
  • It does not use \b to identify a word break before matching a URL. URLs can begin with a left parenthesis or http[s]:// with no other requirement.
  • Apache Commons Lang's StringUtils are used in the code below
  • The call to HtmlUtil.encode() below is a util which ultimately calls some Tomahawk code to HTML-encode the link text, but any similar utility will do.
  • See the method comment for a usage in JSF or other environments where output is HTML Encoded by default.

This was written in response to our client's requirements and we feel it represents a reasonable compromise between the allowable characters from the RFC and common usage. It is offered here in the hopes that it will be useful to others.

Further expansion could be made to allow any Unicode characters to be entered directly (i.e. not escaped as %XX, two-digit hex) and still be hyperlinked, but that would require accepting all Unicode letters plus limited punctuation, then splitting on the "acceptable" delimiters (e.g. ., %, |, #, etc.), URL-encoding each part, and gluing the parts back together. For example, http://en.wikipedia.org/wiki/Björn_Andrésen (which the Stack Overflow generator does not detect) would be "http://en.wikipedia.org/wiki/Bj%C3%B6rn_Andr%C3%A9sen" in the href, but would contain Björn_Andrésen in the linked text on the page.

// Characters allowed anywhere in a URL, including as its final character.
// The string is spliced into a regex character class, therefore:
//   - the '-' comes first so that it is a literal hyphen, not a range operator;
//   - \w stands for 0-9, a-z, A-Z and '_'.
private static final String VALID_CHARS = "-\\w+&@#/%=~()|";

// Punctuation allowed inside a URL but not as its final character.
private static final String VALID_NON_TERMINAL = "?!:,.;";

// How the expression is assembled:
//   \(*        any number of leading '(' -- accepted here, dealt with by the
//              code that consumes the match
//   https?://  the scheme; the 's' is optional, so http and https both match
//   [...]*     zero or more characters, terminal or non-terminal
//   [...]      exactly one closing character from the terminal-capable set
// CASE_INSENSITIVE lets the scheme be written as hTtPs (for example).
private static final Pattern URI_FINDER_PATTERN = Pattern.compile(
        "\\(*"
        + "https?://"
        + "[" + VALID_CHARS + VALID_NON_TERMINAL + "]*"
        + "[" + VALID_CHARS + "]",
        Pattern.CASE_INSENSITIVE );

/**
 * Finds all "URL"s in the given _rawText, wraps them in HTML link tags and
 * returns the result, with the URLs and all of the surrounding text HTML-encoded.
 * <p>
 * We employ the procedure described at
 * http://www.codinghorror.com/blog/2008/10/the-problem-with-urls.html
 * which is a <b>must-read</b>.
 * </p>
 * <p>
 * Basically, we allow any number of left parentheses (which will get stripped away)
 * followed by http:// or https://, then any number of permitted URL characters
 * (based on http://www.ietf.org/rfc/rfc1738.txt) followed by a single character
 * of that set minus typical punctuation.  Leading left parentheses are never part
 * of the link.  A trailing right parenthesis is removed from the link only when it
 * pairs with one of those leading left parentheses; a right parenthesis that closes
 * a left parenthesis <i>inside</i> the URL (wikipedia-style, e.g.
 * <code>.../wiki/Foo_(bar)</code>) stays in the link, as does a trailing right
 * parenthesis with no leading partner.
 * </p>
 * <p>
 * This method <b>must</b> be called from a tag/component which will NOT
 * end up escaping the output.  For example:
 * </p>
 * <pre>
 * &lt;h:outputText ... escape="false" value="#{core:hyperlinkText(textThatMayHaveURLs, '_blank')}"/&gt;
 * </pre>
 * <p>
 * Reason: we are adding <code>&lt;a href="..."&gt;</code> tags to the output <i>and</i>
 * encoding the rest of the string.  Encoding the output again would double-encode
 * the data which was already encoded - and would encode the <code>a href</code>
 * itself (which would render it useless).
 * </p>
 *
 * @param   _rawText  the text to scan; if <code>null</code>, <code>""</code> (empty string) is returned.
 * @param   _target   if not <code>null</code> or blank, a <code>target</code> attribute is added to
 *                    every generated link, using the trimmed, HTML-encoded _target as its value.
 * @return  the HTML-encoded text with every detected URL wrapped in an <code>&lt;a&gt;</code> tag;
 *          never <code>null</code> for <code>null</code> input.
 */
public static final String hyperlinkText( final String _rawText, final String _target ) {

    // Honor the documented contract explicitly rather than depending on how
    // HtmlUtil.encode() happens to treat null.
    if ( _rawText == null ) {
        return "";
    }

    if ( StringUtils.isBlank( _rawText ) ) {
        return HtmlUtil.encode( _rawText );
    }

    final Matcher matcher = URI_FINDER_PATTERN.matcher( _rawText );

    if ( !matcher.find() ) {
        // No URL at all: just encode the text.
        return HtmlUtil.encode( _rawText );
    }

    // The target value is encoded so that it cannot terminate the attribute early.
    // NOTE(review): assumes HtmlUtil.encode() escapes double quotes, as HTML encoders
    // normally do -- confirm against the HtmlUtil implementation in use.
    final String targetText = StringUtils.isBlank( _target )
            ? ""
            : " target=\"" + HtmlUtil.encode( _target.trim() ) + "\"";

    // Counted 15 characters aside from the target + 2 of the URL (max if the whole string is URL)
    // Rough guess, but should keep us from expanding the Builder too many times.
    final StringBuilder returnBuffer =
            new StringBuilder( _rawText.length() * 2 + targetText.length() + 15 );

    int lastEnd = 0;

    do {
        final int matchStart = matcher.start();
        int urlEnd = matcher.end();

        // Leading '(' are never part of the URL; they are emitted as plain text.
        // The pattern guarantees the scheme follows, so this cannot run past the match.
        int urlStart = matchStart;
        while ( _rawText.charAt( urlStart ) == '(' ) {
            urlStart++;
        }

        // Peel one trailing ')' per leading '(' -- but only those ')' that do not
        // close a '(' inside the URL, so ".../wiki/Foo_(bar)" survives intact.
        final int leadingParens = urlStart - matchStart;
        urlEnd -= Math.min( leadingParens, countUnmatchedTrailingParens( _rawText, urlStart, urlEnd ) );

        // The URL may contain '&' (query strings), so it must be encoded both as
        // the attribute value and as the link text.
        final String encodedURL = HtmlUtil.encode( _rawText.substring( urlStart, urlEnd ) );

        // Text since last match
        returnBuffer.append( HtmlUtil.encode( _rawText.substring( lastEnd, urlStart ) ) );

        // Wrap matched URL
        returnBuffer.append( "<a href=\"" ).append( encodedURL ).append( '"' )
                    .append( targetText ).append( '>' )
                    .append( encodedURL ).append( "</a>" );

        lastEnd = urlEnd;

    } while ( matcher.find() );

    // Whatever follows the last URL.
    if ( lastEnd < _rawText.length() ) {
        returnBuffer.append( HtmlUtil.encode( _rawText.substring( lastEnd ) ) );
    }

    return returnBuffer.toString();

}

/**
 * Counts how many of the ')' characters ending the region [_start, _end) of _text
 * are not the closing partner of a '(' located earlier in that same region.
 *
 * @param   _text   the text containing the region.
 * @param   _start  index of the first character of the region (inclusive).
 * @param   _end    index just past the last character of the region (exclusive).
 * @return  the number of trailing ')' left without a partner inside the region; never negative.
 */
private static int countUnmatchedTrailingParens( final String _text, final int _start, final int _end ) {

    // Locate the run of ')' that ends the region.
    int trailingStart = _end;
    while ( trailingStart > _start && _text.charAt( trailingStart - 1 ) == ')' ) {
        trailingStart--;
    }

    // Nesting depth still open when that run begins; a stray ')' cannot push it below zero.
    int openDepth = 0;
    for ( int i = _start; i < trailingStart; i++ ) {
        final char c = _text.charAt( i );
        if ( c == '(' ) {
            openDepth++;
        } else if ( c == ')' && openDepth > 0 ) {
            openDepth--;
        }
    }

    // The first openDepth ')' of the run close parentheses inside the region.
    return Math.max( 0, ( _end - trailingStart ) - openDepth );
}

java - How to detect the presence of URL in a string - Stack Overflow

java url