Thursday 21 November 2013

Extract Web Links Using Java

To extract links, you have to process the response HTML and detect all anchor tags using a regular expression (RegEx).
See the example below:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Extracts hyperlinks from an HTML string using regular expressions.
 *
 * <p>Each anchor element ({@code <a ...>text</a>}) is located first; the
 * {@code href} attribute is then pulled out of the anchor's attribute text.
 * Regex-based extraction is a best-effort approach and is not a full HTML
 * parser (for example, link text spanning several lines is not matched,
 * because {@code .} does not match line terminators here).
 */
public class LinkExt {

    /**
     * Matches one anchor element, case-insensitively.
     * Group 1 is the raw attribute text inside the opening tag, group 2 the
     * link text. The {@code \b} keeps tags that merely start with "a"
     * (such as {@code <abbr>}) from being treated as anchors.
     */
    private static final String HTML_A_TAG_PATTERN = "(?i)<a\\b([^>]+)>(.+?)</a>";

    /**
     * Matches an {@code href} attribute. Group 1 is the attribute value,
     * still including its surrounding single or double quotes when present.
     */
    private static final String HTML_A_HREF_TAG_PATTERN =
        "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";

    private final Pattern patternTag;
    private final Pattern patternLink;

    /** Compiles the anchor and href patterns once for reuse across calls. */
    public LinkExt() {
        patternTag = Pattern.compile(HTML_A_TAG_PATTERN);
        patternLink = Pattern.compile(HTML_A_HREF_TAG_PATTERN);
    }

    /**
     * Extracts every link found in the given HTML.
     *
     * @param html
     *            html content to scan; {@code null} is treated as empty
     * @return Vector of links with their link text, in document order;
     *         empty when nothing matches
     */
    public Vector<HtmlLink> grabHTMLLinks(final String html) {

        Vector<HtmlLink> result = new Vector<HtmlLink>();
        if (html == null) {
            return result;
        }

        // Matchers are kept local: they are per-call state, not object state.
        Matcher matcherTag = patternTag.matcher(html);

        while (matcherTag.find()) {

            String attributes = matcherTag.group(1); // text inside the opening tag
            String linkText = matcherTag.group(2); // text between <a ...> and </a>

            Matcher matcherLink = patternLink.matcher(attributes);

            while (matcherLink.find()) {
                HtmlLink obj = new HtmlLink();
                obj.setLink(matcherLink.group(1)); // quotes are stripped by setLink
                obj.setLinkText(linkText);
                result.add(obj);
            }
        }

        return result;
    }

    /** Simple holder for one extracted link: its target and its visible text. */
    class HtmlLink {

        String link;
        String linkText;

        HtmlLink() {
        }

        @Override
        public String toString() {
            return new StringBuilder("Link : ").append(this.link).append(" :: ")
                .append(" Link Text : ").append(this.linkText).toString();
        }

        public String getLink() {
            return link;
        }

        /** Stores the link after removing any single or double quote characters. */
        public void setLink(String link) {
            this.link = replaceInvalidChar(link);
        }

        public String getLinkText() {
            return linkText;
        }

        public void setLinkText(String linkText) {
            this.linkText = linkText;
        }

        /** Removes every {@code '} and {@code "} character from the given value. */
        private String replaceInvalidChar(String link) {
            // Literal replacement; no regex is needed for fixed characters.
            return link.replace("'", "").replace("\"", "");
        }

    }

    /**
     * Demo: downloads a page, prints the raw HTML, then prints every link
     * extracted from it.
     */
    public static void main(String[] args) {
        try {
            // HTML RESPONSE
            URL u = new URL("http://www.google.com");
            URLConnection conn = u.openConnection();
            StringBuilder buffer = new StringBuilder();
            BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream()));
            try {
                String inputLine;
                // readLine() drops line terminators, so the page is joined into
                // a single line before the patterns are applied.
                while ((inputLine = in.readLine()) != null) {
                    buffer.append(inputLine);
                }
            } finally {
                // Close the reader even when reading fails part-way.
                in.close();
            }
            String html = buffer.toString();
            System.out.println("HTML RESPONSE:  " + html);
            // HTML RESPONSE

            // LINK EXTRACTOR
            LinkExt linkExt = new LinkExt();
            Vector<HtmlLink> links = linkExt.grabHTMLLinks(html);
            for (HtmlLink htmlLink : links) {
                System.out.println(htmlLink);
            }
            // LINK EXTRACTOR

        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}

0 comments:

Post a Comment

Related Posts Plugin for WordPress, Blogger...