You have to process the response HTML and detect all anchor tags using a regular expression.
See the example below:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts hyperlinks (the {@code href} value and the link text) from HTML
 * markup using regular expressions.
 *
 * <p>This is a regex-based scan, not an HTML parser: it only recognises
 * anchors written as {@code <a ...>text</a>} whose text does not contain a
 * line terminator.
 */
public class LinkExt {

    /**
     * Matches one anchor element. Group 1 is the attribute section of the
     * opening tag, group 2 is the text between the opening and closing tag.
     * The {@code \b} keeps tags that merely start with "a" (abbr, article,
     * address, ...) from being treated as anchors.
     */
    private static final String HTML_A_TAG_PATTERN = "(?i)<a\\b([^>]+)>(.+?)</a>";

    /**
     * Matches an {@code href} attribute. Group 1 is the attribute value,
     * including its surrounding quotes when it is quoted.
     */
    private static final String HTML_A_HREF_TAG_PATTERN =
            "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";

    // Compiled once; Pattern instances are immutable and safe to share.
    private static final Pattern TAG_PATTERN = Pattern.compile(HTML_A_TAG_PATTERN);
    private static final Pattern HREF_PATTERN = Pattern.compile(HTML_A_HREF_TAG_PATTERN);

    /** Creates a link extractor. */
    public LinkExt() {
    }

    /**
     * Finds every anchor in the given HTML and returns its link target and
     * link text.
     *
     * @param html
     *            HTML content to scan; {@code null} is treated as empty input
     * @return the links in document order, one entry per {@code href}
     *         attribute found; never {@code null}
     */
    public Vector<HtmlLink> grabHTMLLinks(final String html) {
        Vector<HtmlLink> result = new Vector<HtmlLink>();
        if (html == null) {
            return result;
        }

        // Matchers are kept local so concurrent or nested calls cannot
        // interfere with each other's scan position.
        Matcher tagMatcher = TAG_PATTERN.matcher(html);
        while (tagMatcher.find()) {
            String attributes = tagMatcher.group(1);
            String linkText = tagMatcher.group(2);

            Matcher hrefMatcher = HREF_PATTERN.matcher(attributes);
            while (hrefMatcher.find()) {
                HtmlLink found = new HtmlLink();
                found.setLink(hrefMatcher.group(1)); // setLink strips the quotes
                found.setLinkText(linkText);
                result.add(found);
            }
        }
        return result;
    }

    /** A single extracted hyperlink: its target and its visible text. */
    class HtmlLink {

        String link;
        String linkText;

        HtmlLink() {
        }

        @Override
        public String toString() {
            return new StringBuilder("Link : ").append(this.link).append(" :: ")
                    .append(" Link Text : ").append(this.linkText).toString();
        }

        /** @return the link target with quote characters removed */
        public String getLink() {
            return link;
        }

        /**
         * Stores the link target after removing single and double quotes.
         *
         * @param link
         *            raw attribute value, possibly still quoted; may be
         *            {@code null}
         */
        public void setLink(String link) {
            this.link = replaceInvalidChar(link);
        }

        /** @return the text found between the opening and closing tag */
        public String getLinkText() {
            return linkText;
        }

        public void setLinkText(String linkText) {
            this.linkText = linkText;
        }

        /** Removes every single and double quote character from the value. */
        private String replaceInvalidChar(String link) {
            if (link == null) {
                return null;
            }
            // Literal replacement; no regular expression is needed here.
            return link.replace("'", "").replace("\"", "");
        }
    }

    /**
     * Demo: downloads a page, prints it, then prints every link found in it.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) {
        try {
            // HTML RESPONSE
            URL u = new URL("http://www.google.com");
            URLConnection conn = u.openConnection();

            StringBuilder buffer = new StringBuilder();
            // NOTE(review): no charset is given, so the platform default is
            // used to decode the response; confirm this suits the target site.
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream()));
            try {
                // Lines are appended without separators, so the text handed
                // to the extractor contains no line terminators.
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    buffer.append(inputLine);
                }
            } finally {
                // Close the stream even when reading fails part-way through.
                in.close();
            }

            String html = buffer.toString();
            System.out.println("HTML RESPONSE: " + html);

            // LINK EXTRACTOR
            LinkExt linkExt = new LinkExt();
            Vector<HtmlLink> links = linkExt.grabHTMLLinks(html);
            for (HtmlLink htmlLink : links) {
                System.out.println(htmlLink);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
0 comments:
Post a Comment