Skip to content

Instantly share code, notes, and snippets.

@planbnet
Created March 16, 2010 07:51
Show Gist options
  • Select an option

  • Save planbnet/333736 to your computer and use it in GitHub Desktop.

Select an option

Save planbnet/333736 to your computer and use it in GitHub Desktop.

Revisions

  1. planbnet revised this gist Mar 16, 2010. 3 changed files with 146 additions and 138 deletions.
    145 changes: 145 additions & 0 deletions genericxpath.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,145 @@
    // Bookmarklet to calculate the most generic xpath for the current selection
    // (helper utility for scraping websites)

    function selectionxpath() {

    // Builds a short XPath expression for the node `sel`.
    //
    // The expression is assembled from up to three "//" anchors followed by an
    // explicit child path. Each anchor is searched from `sel` upwards, stopping
    // before the node the previous anchor already pinned down:
    //   1. the nearest node with a non-empty id            -> //tag[@id='...']
    //   2. the nearest element whose tag+class step, appended to the path so
    //      far, yields that element as the FIRST XPath match -> //tag[@class='...']
    //   3. the nearest element whose bare tag step does the same -> //tag
    // As soon as an anchor lands on `sel` itself the path is returned. Otherwise
    // the levels between the last anchor and `sel` are spelled out as /tag or
    // /tag[n] steps, where n counts same-tag preceding siblings (1-based).
    //
    // Id and class values are emitted as properly quoted XPath literals, so a
    // value containing ' or " does not produce a malformed expression, and the
    // probe expressions use the same lower-cased tag name as the returned path.
    //
    // sel: DOM node to describe. If `sel` is not an element, no /tag steps are
    //      emitted for it.
    // Returns the XPath string (possibly "" when nothing could be derived).
    // Relies on the global `document.evaluate` and `XPathResult`.
    function calculateShortestXpathOfElement( sel ) {
        var stop = null;  // deepest ancestor already addressed by `xpath`
        var xpath = "";

        // Quotes `value` as an XPath 1.0 string literal. XPath has no escape
        // character, so a value holding both quote kinds is built with concat().
        function xpathLiteral( value ) {
            var text = String(value);
            if (text.indexOf("'") === -1) return "'" + text + "'";
            if (text.indexOf('"') === -1) return '"' + text + '"';
            return "concat('" + text.split("'").join("', \"'\", '") + "')";
        }

        // First node matched by `expression` in document order, or null.
        function firstMatch( expression ) {
            var result = document.evaluate(expression, sel.ownerDocument, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
            return result.singleNodeValue;
        }

        // Each step builder returns the "//..." step for `node` when the node
        // qualifies as an anchor, or null when it does not.
        function idStep( node ) {
            if (node.id && node.id != "") {
                return "//" + node.tagName.toLowerCase() + "[@id=" + xpathLiteral(node.id) + "]";
            }
            return null;
        }

        function classStep( node ) {
            if (node.nodeType !== 1) return null;
            var styleClass = node.getAttribute("class");
            if (styleClass == null) return null;
            var step = "//" + node.tagName.toLowerCase() + "[@class=" + xpathLiteral(styleClass) + "]";
            return firstMatch(xpath + step) == node ? step : null;
        }

        function tagStep( node ) {
            if (node.nodeType !== 1) return null;
            var step = "//" + node.tagName.toLowerCase();
            return firstMatch(xpath + step) == node ? step : null;
        }

        var strategies = [idStep, classStep, tagStep];
        for (var s = 0; s < strategies.length; s++) {
            var node = sel;
            var step = null;
            // Walk upwards until a node qualifies or the previous anchor
            // (or the top of the tree, while `stop` is still null) is reached.
            while (true) {
                step = strategies[s](node);
                if (step != null) break;
                node = node.parentNode;
                if (node == stop) break;
            }
            if (step != null) {
                xpath += step;
                if (node == sel) return xpath;
                stop = node;
            }
        }

        // Spell out the remaining levels between the last anchor and `sel`.
        var restPath = "";
        for (var cur = sel; cur && cur.nodeType == 1; cur = cur.parentNode) {
            if (cur == stop) break;
            var idx = 1;
            for (var sib = cur.previousSibling; sib; sib = sib.previousSibling) {
                if (sib.nodeType == 1 && sib.tagName == cur.tagName) idx++;
            }
            var xname = cur.tagName.toLowerCase();
            // An index is only needed when earlier same-tag siblings exist.
            if (idx > 1) xname += "[" + idx + "]";
            restPath = "/" + xname + restPath;
        }

        return xpath + restPath;
    }

    // Counts the nodes on the parentNode chain starting at `el`, `el` included.
    // Returns 0 when `el` is null/undefined. The counter is a local variable:
    // assigning to an undeclared name would create a global on the host page
    // and throws a ReferenceError in strict mode.
    function depthOf( el ) {
        var depth = 0;
        for (var node = el; node; node = node.parentNode) {
            depth++;
        }
        return depth;
    }

    // Returns a short XPath for the element enclosing the current selection,
    // or null when nothing is selected.
    //
    // Uses the first range of window.getSelection(). Its common ancestor
    // container is the deepest node holding both ends of the selection; if that
    // is not an element (e.g. a text node), the nearest element ancestor is
    // used, because calculateShortestXpathOfElement only emits steps for
    // elements.
    function calculateShortestXpathOfSelection() {
        var selection = window.getSelection();
        // getRangeAt(0) throws when there is no range, so test rangeCount first.
        if (!selection || selection.rangeCount === 0) return null;

        var target = selection.getRangeAt(0).commonAncestorContainer;
        while (target && target.nodeType !== 1) {
            target = target.parentNode;
        }
        if (!target) return null;

        return calculateShortestXpathOfElement(target);
    }

    // Compute the XPath for the current selection, outline the node it
    // resolves to while the (blocking) prompt is open, then restore the
    // node's previous inline border.
    var xpath = calculateShortestXpathOfSelection();
    if (xpath) {
        // Evaluate only once an expression exists, and tolerate an expression
        // that resolves to nothing: the result is still shown, just without
        // the highlight.
        var node = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
        var canHighlight = !!(node && node.style);
        var border = "";
        if (canHighlight) {
            border = node.style.border || "";
            node.style.border = "2px dashed red";
        }
        prompt("Most generic xpath for selection:", xpath);
        if (canHighlight) {
            node.style.border = border;
        }
    } else {
        alert("Could not determine generic xpath for selection");
    }

    }

    selectionxpath();
    138 changes: 0 additions & 138 deletions most generic selection xpath javascript
    Original file line number Diff line number Diff line change
    @@ -1,138 +0,0 @@
    function selectionxpath() {

    // Builds a short XPath expression for the node `sel`.
    //
    // The expression is assembled from up to three "//" anchors followed by an
    // explicit child path. Each anchor is searched from `sel` upwards, stopping
    // before the node the previous anchor already pinned down:
    //   1. the nearest node with a non-empty id            -> //tag[@id='...']
    //   2. the nearest element whose tag+class step, appended to the path so
    //      far, yields that element as the FIRST XPath match -> //tag[@class='...']
    //   3. the nearest element whose bare tag step does the same -> //tag
    // As soon as an anchor lands on `sel` itself the path is returned. Otherwise
    // the levels between the last anchor and `sel` are spelled out as /tag or
    // /tag[n] steps, where n counts same-tag preceding siblings (1-based).
    //
    // Id and class values are emitted as properly quoted XPath literals, so a
    // value containing ' or " does not produce a malformed expression, and the
    // probe expressions use the same lower-cased tag name as the returned path.
    //
    // sel: DOM node to describe. If `sel` is not an element, no /tag steps are
    //      emitted for it.
    // Returns the XPath string (possibly "" when nothing could be derived).
    // Relies on the global `document.evaluate` and `XPathResult`.
    function calculateShortestXpathOfElement( sel ) {
        var stop = null;  // deepest ancestor already addressed by `xpath`
        var xpath = "";

        // Quotes `value` as an XPath 1.0 string literal. XPath has no escape
        // character, so a value holding both quote kinds is built with concat().
        function xpathLiteral( value ) {
            var text = String(value);
            if (text.indexOf("'") === -1) return "'" + text + "'";
            if (text.indexOf('"') === -1) return '"' + text + '"';
            return "concat('" + text.split("'").join("', \"'\", '") + "')";
        }

        // First node matched by `expression` in document order, or null.
        function firstMatch( expression ) {
            var result = document.evaluate(expression, sel.ownerDocument, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
            return result.singleNodeValue;
        }

        // Each step builder returns the "//..." step for `node` when the node
        // qualifies as an anchor, or null when it does not.
        function idStep( node ) {
            if (node.id && node.id != "") {
                return "//" + node.tagName.toLowerCase() + "[@id=" + xpathLiteral(node.id) + "]";
            }
            return null;
        }

        function classStep( node ) {
            if (node.nodeType !== 1) return null;
            var styleClass = node.getAttribute("class");
            if (styleClass == null) return null;
            var step = "//" + node.tagName.toLowerCase() + "[@class=" + xpathLiteral(styleClass) + "]";
            return firstMatch(xpath + step) == node ? step : null;
        }

        function tagStep( node ) {
            if (node.nodeType !== 1) return null;
            var step = "//" + node.tagName.toLowerCase();
            return firstMatch(xpath + step) == node ? step : null;
        }

        var strategies = [idStep, classStep, tagStep];
        for (var s = 0; s < strategies.length; s++) {
            var node = sel;
            var step = null;
            // Walk upwards until a node qualifies or the previous anchor
            // (or the top of the tree, while `stop` is still null) is reached.
            while (true) {
                step = strategies[s](node);
                if (step != null) break;
                node = node.parentNode;
                if (node == stop) break;
            }
            if (step != null) {
                xpath += step;
                if (node == sel) return xpath;
                stop = node;
            }
        }

        // Spell out the remaining levels between the last anchor and `sel`.
        var restPath = "";
        for (var cur = sel; cur && cur.nodeType == 1; cur = cur.parentNode) {
            if (cur == stop) break;
            var idx = 1;
            for (var sib = cur.previousSibling; sib; sib = sib.previousSibling) {
                if (sib.nodeType == 1 && sib.tagName == cur.tagName) idx++;
            }
            var xname = cur.tagName.toLowerCase();
            // An index is only needed when earlier same-tag siblings exist.
            if (idx > 1) xname += "[" + idx + "]";
            restPath = "/" + xname + restPath;
        }

        return xpath + restPath;
    }

    // Counts the nodes on the parentNode chain starting at `el`, `el` included.
    // Returns 0 when `el` is null/undefined. The counter is a local variable:
    // assigning to an undeclared name would create a global on the host page
    // and throws a ReferenceError in strict mode.
    function depthOf( el ) {
        var depth = 0;
        for (var node = el; node; node = node.parentNode) {
            depth++;
        }
        return depth;
    }

    // Returns a short XPath for the element enclosing the current selection,
    // or null when nothing is selected.
    //
    // Uses the first range of window.getSelection(). Its common ancestor
    // container is the deepest node holding both ends of the selection; if that
    // is not an element (e.g. a text node), the nearest element ancestor is
    // used, because calculateShortestXpathOfElement only emits steps for
    // elements.
    function calculateShortestXpathOfSelection() {
        var selection = window.getSelection();
        // getRangeAt(0) throws when there is no range, so test rangeCount first.
        if (!selection || selection.rangeCount === 0) return null;

        var target = selection.getRangeAt(0).commonAncestorContainer;
        while (target && target.nodeType !== 1) {
            target = target.parentNode;
        }
        if (!target) return null;

        return calculateShortestXpathOfElement(target);
    }


    // Show the computed expression in a prompt (so it can be copied), or tell
    // the user when no expression could be derived from the selection.
    var genericXpath = calculateShortestXpathOfSelection();
    if (!genericXpath) {
        alert("Could not determine generic xpath for selection");
    } else {
        prompt("Most generic xpath for selection:", genericXpath);
    }

    }

    selectionxpath();
    1 change: 1 addition & 0 deletions selectionxpath.html
    Original file line number Diff line number Diff line change
    @@ -0,0 +1 @@
    <html><body><p><a href="javascript:function%20selectionxpath()%20{function%20calculateShortestXpathOfElement(%20sel%20)%20{var%20node%20=%20sel;var%20nextId%20=%20null;var%20stop%20=%20null;var%20xpath%20=%20%22%22;while%20(true)%20{if%20(node.id%20&&%20node.id%20!=%20%22%22)%20{nextId%20=%20node.id;break;}node%20=%20node.parentNode;if%20(node%20==%20stop)%20break;}if%20(nextId%20!=%20null)%20{xpath%20=%20%22//%22%20+%20node.tagName.toLowerCase()%20+%20%22[@id=%27%22%20+%20nextId%20+%20%22%27]%22;if%20(%20node%20==%20sel%20)%20{return%20xpath;}%20else%20{stop%20=%20node;}}node%20=%20sel;var%20nextUniqueClass%20=%20null;while%20(true)%20{if%20(node.nodeType%20===%201)%20{var%20styleClass%20=%20node.getAttribute(%22class%22);if%20(styleClass%20!=%20null)%20{var%20tmpXpath%20=%20xpath+%22//%22+node.tagName+%22[@class=%27%22+styleClass+%22%27]%22;var%20tempResult%20=%20document.evaluate(tmpXpath,%20sel.ownerDocument,%20null,%20XPathResult.FIRST_ORDERED_NODE_TYPE,%20null);if%20(tempResult.singleNodeValue%20==%20node)%20{nextUniqueClass%20=%20styleClass;break;}}}node%20=%20node.parentNode;if%20(node%20==%20stop)%20break;}if%20(nextUniqueClass%20!=%20null)%20{xpath%20+=%20%22//%22+node.tagName.toLowerCase()+%22[@class=%27%22+nextUniqueClass+%22%27]%22;if%20(%20node%20==%20sel%20)%20{return%20xpath;}%20else%20{stop%20=%20node;}}node%20=%20sel;var%20nextUniqueTag%20=%20null;while%20(true)%20{if%20(node.nodeType%20===%201)%20{var%20tmpXpath%20=%20xpath+%22//%22+node.tagName;var%20tempResult%20=%20document.evaluate(tmpXpath,%20sel.ownerDocument,%20null,%20XPathResult.FIRST_ORDERED_NODE_TYPE,%20null);if%20(tempResult.singleNodeValue%20==%20node)%20{nextUniqueTag%20=%20node.tagName;break;}}node%20=%20node.parentNode;if%20(node%20==%20stop)%20break;}if%20(nextUniqueTag%20!=%20null)%20{xpath%20+=%20%22//%22+node.tagName.toLowerCase();if%20(%20node%20==%20sel%20)%20{return%20xpath;}%20else%20{stop%20=%20node;}}var%20restPath%20=%20%22%22;for%20(node%20=%20sel;%20node%20&&%20nod
e.nodeType%20==%201;%20node%20=%20node.parentNode)%20{if%20(node%20==%20stop)%20break;var%20idx%20=%201;for%20(var%20sib%20=%20node.previousSibling;%20sib%20;%20sib%20=%20sib.previousSibling)%20{if(sib.nodeType%20==%201%20&&%20sib.tagName%20==%20node.tagName)%20idx++;}var%20xname%20=%20node.tagName.toLowerCase();if%20(idx%20>%201)%20xname%20+=%20%22[%22%20+%20idx%20+%20%22]%22;restPath%20=%20%22/%22%20+%20xname%20+%20restPath;}var%20result%20=%20xpath%20+%20restPath;return%20result;}function%20depthOf(%20el%20)%20{i%20=%200;while%20(el)%20{el%20=%20el.parentNode;i++;}return%20i;}function%20calculateShortestXpathOfSelection()%20{var%20sel%20=%20window.getSelection().getRangeAt(0);if%20(!sel)%20return%20null;var%20start%20=%20sel.startContainer;var%20end%20=%20sel.endContainer;var%20i%20=%20depthOf(%20start%20);var%20j%20=%20depthOf(%20end%20);while%20(start%20!=%20end%20&&%20i%20!=%200%20&&%20j%20!=%200)%20{if%20(i%20>%20j)%20{start%20=%20start.parentNode;i--;}%20else%20{end%20=%20end.parentNode;j--;}}return%20calculateShortestXpathOfElement(start);}var%20xpath%20=%20calculateShortestXpathOfSelection();var%20node%20=%20document.evaluate(xpath,%20document,%20null,%20XPathResult.FIRST_ORDERED_NODE_TYPE,%20null).singleNodeValue;var%20border%20=%20node.style.border;if%20(!border)%20border%20=%20%22%22;node.style.border%20=%20%222px%20dashed%20red%22;if%20(xpath)%20{prompt(%22Most%20generic%20xpath%20for%20selection:%22,%20xpath);node.style.border%20=%20border;}%20else%20{alert(%22Could%20not%20determine%20generic%20xpath%20for%20selection%22);}}selectionxpath();">Selection XPath</a></p><p/><p>Drag the link to your bookmark bar</p></body></html>
  2. planbnet created this gist Mar 16, 2010.
    138 changes: 138 additions & 0 deletions most generic selection xpath javascript
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,138 @@
    function selectionxpath() {

    // Builds a short XPath expression for the node `sel`.
    //
    // The expression is assembled from up to three "//" anchors followed by an
    // explicit child path. Each anchor is searched from `sel` upwards, stopping
    // before the node the previous anchor already pinned down:
    //   1. the nearest node with a non-empty id            -> //tag[@id='...']
    //   2. the nearest element whose tag+class step, appended to the path so
    //      far, yields that element as the FIRST XPath match -> //tag[@class='...']
    //   3. the nearest element whose bare tag step does the same -> //tag
    // As soon as an anchor lands on `sel` itself the path is returned. Otherwise
    // the levels between the last anchor and `sel` are spelled out as /tag or
    // /tag[n] steps, where n counts same-tag preceding siblings (1-based).
    //
    // Id and class values are emitted as properly quoted XPath literals, so a
    // value containing ' or " does not produce a malformed expression, and the
    // probe expressions use the same lower-cased tag name as the returned path.
    //
    // sel: DOM node to describe. If `sel` is not an element, no /tag steps are
    //      emitted for it.
    // Returns the XPath string (possibly "" when nothing could be derived).
    // Relies on the global `document.evaluate` and `XPathResult`.
    function calculateShortestXpathOfElement( sel ) {
        var stop = null;  // deepest ancestor already addressed by `xpath`
        var xpath = "";

        // Quotes `value` as an XPath 1.0 string literal. XPath has no escape
        // character, so a value holding both quote kinds is built with concat().
        function xpathLiteral( value ) {
            var text = String(value);
            if (text.indexOf("'") === -1) return "'" + text + "'";
            if (text.indexOf('"') === -1) return '"' + text + '"';
            return "concat('" + text.split("'").join("', \"'\", '") + "')";
        }

        // First node matched by `expression` in document order, or null.
        function firstMatch( expression ) {
            var result = document.evaluate(expression, sel.ownerDocument, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
            return result.singleNodeValue;
        }

        // Each step builder returns the "//..." step for `node` when the node
        // qualifies as an anchor, or null when it does not.
        function idStep( node ) {
            if (node.id && node.id != "") {
                return "//" + node.tagName.toLowerCase() + "[@id=" + xpathLiteral(node.id) + "]";
            }
            return null;
        }

        function classStep( node ) {
            if (node.nodeType !== 1) return null;
            var styleClass = node.getAttribute("class");
            if (styleClass == null) return null;
            var step = "//" + node.tagName.toLowerCase() + "[@class=" + xpathLiteral(styleClass) + "]";
            return firstMatch(xpath + step) == node ? step : null;
        }

        function tagStep( node ) {
            if (node.nodeType !== 1) return null;
            var step = "//" + node.tagName.toLowerCase();
            return firstMatch(xpath + step) == node ? step : null;
        }

        var strategies = [idStep, classStep, tagStep];
        for (var s = 0; s < strategies.length; s++) {
            var node = sel;
            var step = null;
            // Walk upwards until a node qualifies or the previous anchor
            // (or the top of the tree, while `stop` is still null) is reached.
            while (true) {
                step = strategies[s](node);
                if (step != null) break;
                node = node.parentNode;
                if (node == stop) break;
            }
            if (step != null) {
                xpath += step;
                if (node == sel) return xpath;
                stop = node;
            }
        }

        // Spell out the remaining levels between the last anchor and `sel`.
        var restPath = "";
        for (var cur = sel; cur && cur.nodeType == 1; cur = cur.parentNode) {
            if (cur == stop) break;
            var idx = 1;
            for (var sib = cur.previousSibling; sib; sib = sib.previousSibling) {
                if (sib.nodeType == 1 && sib.tagName == cur.tagName) idx++;
            }
            var xname = cur.tagName.toLowerCase();
            // An index is only needed when earlier same-tag siblings exist.
            if (idx > 1) xname += "[" + idx + "]";
            restPath = "/" + xname + restPath;
        }

        return xpath + restPath;
    }

    // Counts the nodes on the parentNode chain starting at `el`, `el` included.
    // Returns 0 when `el` is null/undefined. The counter is a local variable:
    // assigning to an undeclared name would create a global on the host page
    // and throws a ReferenceError in strict mode.
    function depthOf( el ) {
        var depth = 0;
        for (var node = el; node; node = node.parentNode) {
            depth++;
        }
        return depth;
    }

    // Returns a short XPath for the element enclosing the current selection,
    // or null when nothing is selected.
    //
    // Uses the first range of window.getSelection(). Its common ancestor
    // container is the deepest node holding both ends of the selection; if that
    // is not an element (e.g. a text node), the nearest element ancestor is
    // used, because calculateShortestXpathOfElement only emits steps for
    // elements.
    function calculateShortestXpathOfSelection() {
        var selection = window.getSelection();
        // getRangeAt(0) throws when there is no range, so test rangeCount first.
        if (!selection || selection.rangeCount === 0) return null;

        var target = selection.getRangeAt(0).commonAncestorContainer;
        while (target && target.nodeType !== 1) {
            target = target.parentNode;
        }
        if (!target) return null;

        return calculateShortestXpathOfElement(target);
    }


    // Show the computed expression in a prompt (so it can be copied), or tell
    // the user when no expression could be derived from the selection.
    var genericXpath = calculateShortestXpathOfSelection();
    if (!genericXpath) {
        alert("Could not determine generic xpath for selection");
    } else {
        prompt("Most generic xpath for selection:", genericXpath);
    }

    }

    selectionxpath();