How to extract a table from HTML

Published: | Updated: | by Julian Knight Reading time ~4 min.
📖 Kb | 📎 Development | 🔖 node-red

While Node-RED has some nodes for extracting data from HTML, the nodes are rather simplistic. Here is a quick way to extract the data from an HTML table element.

While Node-RED has some nodes for extracting data from HTML, the nodes are rather simplistic.

Here is a quick way to extract the data from an HTML table element.

To use it, you need to install the cheerio node.js module to your user data folder (usually ~/.node-red).

Then you need to reference it in the global variables part of your settings.js file (same location as above).

    functionGlobalContext: {
        cheerio: require('cheerio'),
    },

Then add a flow that returns html from a web page (using the Request node).

Then add a function node with the following code:

/*jshint sub:true,asi:true,maxerr:1000*/

const tableSelector = '#ContentPlaceHolder1_grdStation'

const cheerio = global.get('cheerio')
const $ = cheerio.load(msg.payload)
const options = {
    rowForHeadings: 0,  // extract th cells from this row for column headings (zero-based)
    ignoreHeadingRow: true, // Don't tread the heading row as data
    ignoreRows: [],
}
const jsonReponse = []
const columnHeadings = []

$(tableSelector).each(function(i, table) {
    var trs = $(table).find('tr')

    // Set up the column heading names
    getColHeadings( $(trs[options.rowForHeadings]) )

    // Process rows for data
    $(table).find('tr').each(processRow)
})

msg.payload = {
    columnHeadings: columnHeadings,
    rows: jsonReponse,
}

return msg

function getColHeadings(headingRow) {
    const alreadySeen = {}

    $(headingRow).find('th').each(function(j, cell) {
        let tr = $(cell).text().trim()

        if ( alreadySeen[tr] ) {
            let suffix = ++alreadySeen[tr]
            tr = `${tr}_${suffix}`
        } else {
            alreadySeen[tr] = 1
        }

        columnHeadings.push(tr)
    })
}

function processRow(i, row) {
    const rowJson = {}

    if ( options.ignoreHeadingRow && i === options.rowForHeadings ) return
    // TODO: Process options.ignoreRows

    $(row).find('td').each(function(j, cell) {
        rowJson[ columnHeadings[j] ] = $(cell).text().trim()
    })

    // Skip blank rows
    if (JSON.stringify(rowJson) !== '{}') jsonReponse.push(rowJson)
}

//EOF

The result is a msg.payload containing 2 arrays. The first is the list of column names. The second is an array of objects, each object containing a property for each column.

Code inspired by: https://github.com/iaincollins/tabletojson/blob/master/lib/tabletojson.js and by a conversation in the Node-RED Google Group.

Example Flow 🔗︎

[{"id":"815bf28f.d0691","type":"inject","z":"e7463dd2.db517","name":"get data","topic":"","payload":"","payloadType":"date","repeat":"","crontab":"","once":false,"x":120,"y":100,"wires":[["46f8420f.a312dc"]]},{"id":"46f8420f.a312dc","type":"http request","z":"e7463dd2.db517","name":"","method":"GET","ret":"txt","url":"http://publicinfobanjir.water.gov.my/View/OnlineFloodInfo/PublicWaterLevel.aspx?scode=SEL","tls":"","x":330,"y":100,"wires":[["808d9c98.e305d"]]},{"id":"fe447848.e25938","type":"debug","z":"e7463dd2.db517","name":"","active":true,"complete":false,"x":810,"y":60,"wires":[]},{"id":"d0660df5.4f339","type":"function","z":"e7463dd2.db517","name":"","func":"/*jshint sub:true,asi:true,maxerr:1000*/\n\nconst tableSelector = '#ContentPlaceHolder1_grdStation'\n\nconst cheerio = global.get('cheerio')\nconst $ = cheerio.load(msg.payload)\nconst options = {\n    rowForHeadings: 0,  // extract th cells from this row for column headings (zero-based)\n    ignoreHeadingRow: true, // Don't tread the heading row as data\n    ignoreRows: [],\n}\nconst jsonReponse = []\nconst columnHeadings = []\n\n$(tableSelector).each(function(i, table) {\n    var trs = $(table).find('tr')\n    \n    // Set up the column heading names\n    getColHeadings( $(trs[options.rowForHeadings]) )\n\n    // Process rows for data\n    $(table).find('tr').each(processRow)\n})\n\nmsg.payload = {\n    columnHeadings: columnHeadings,\n    rows: jsonReponse,\n}\n\nreturn msg\n\nfunction getColHeadings(headingRow) {\n    const alreadySeen = {}\n    \n    $(headingRow).find('th').each(function(j, cell) {\n        let tr = $(cell).text().trim()\n        \n        if ( alreadySeen[tr] ) {\n            let suffix = ++alreadySeen[tr]\n            tr = `${tr}_${suffix}`\n        } else {\n            alreadySeen[tr] = 1\n        }\n        \n        columnHeadings.push(tr)\n    })\n}\n\nfunction processRow(i, row) {\n    const rowJson = {}\n    \n    if ( options.ignoreHeadingRow && i === options.rowForHeadings ) return\n    // TODO: Process options.ignoreRows\n    \n    $(row).find('td').each(function(j, cell) {\n        rowJson[ columnHeadings[j] ] = $(cell).text().trim()\n    })\n    \n    // Skip blank rows\n    if (JSON.stringify(rowJson) !== '{}') jsonReponse.push(rowJson)\n}\n\n//EOF","outputs":"1","noerr":0,"x":650,"y":100,"wires":[["fe447848.e25938","c35dcf95.5cc9","641996d3.fe8118"]]},{"id":"641996d3.fe8118","type":"change","z":"e7463dd2.db517","name":"","rules":[{"t":"set","p":"selected","pt":"msg","to":"{}","tot":"json"},{"t":"set","p":"selected.station","pt":"msg","to":"payload.rows[40]['Station ID']","tot":"msg"},{"t":"set","p":"selected.level","pt":"msg","to":"payload.rows[40]['River Level (m)']","tot":"msg"},{"t":"move","p":"selected","pt":"msg","to":"payload","tot":"msg"}],"action":"","property":"","from":"","to":"","reg":false,"x":820,"y":100,"wires":[["f0152260.70e1d"]]},{"id":"f0152260.70e1d","type":"debug","z":"e7463dd2.db517","name":"","active":true,"complete":false,"x":990,"y":100,"wires":[]},{"id":"c35dcf95.5cc9","type":"change","z":"e7463dd2.db517","name":"","rules":[{"t":"set","p":"payload","pt":"msg","to":"$$.payload.rows.(\t   {\t       \"station\": $.'Station ID',\t       \"name\": $.'Station Name',\t       \"level\": $.'River Level (m)',\t       \"alert\": $number($.'River Level (m)') > $number($.Alert) ? true : false\t   }\t)","tot":"jsonata"}],"action":"","property":"","from":"","to":"","reg":false,"x":820,"y":140,"wires":[["d505285a.26bcb8"]]},{"id":"d505285a.26bcb8","type":"debug","z":"e7463dd2.db517","name":"","active":true,"complete":false,"x":990,"y":140,"wires":[]},{"id":"808d9c98.e305d","type":"function","z":"e7463dd2.db517","name":"cache","func":"/*jshint sub:true,asi:true,maxerr:1000*/\n// Expects input msgs with topic set \n\nconst contextVarName = 'httpReqMsgs' // homeMsgs\n\n// saved context\nvar cachedMsgs = context.get(contextVarName) || {}\n\n// Only send to single client if needed\nvar socketId = null\nif ( msg.hasOwnProperty('_socketId') ) {\n    socketId = msg._socketId\n}\n\n// Replay cache if requested\nif ( msg.hasOwnProperty('cacheControl') && msg.cacheControl.toUpperCase() === 'REPLAY' ) {\n    for (var topic in cachedMsgs) {\n        let newMsg = {\n            \"topic\": topic, \n            \"payload\": cachedMsgs[topic]\n        }\n        // Only send to a single client if we can\n        if ( socketId !== null ) newMsg._socketId = socketId\n        node.send(newMsg)\n    }\n    return null\n}\n// -- else if --\n// Empty cache if requested\nif ( (msg.hasOwnProperty('cacheControl') && msg.cacheControl === 'RESET')  ||\n     (msg.payload.hasOwnProperty('cacheControl') && msg.payload.cacheControl === 'RESET') ) {\n    cachedMsgs = {}\n    context.set(contextVarName, cachedMsgs)\n    return null\n}\n// -- else --\n\n// ignore cacheControl and uibuilder control messages\nif ( msg.hasOwnProperty('cacheControl') || msg.hasOwnProperty('uibuilderCtrl') ) return null\n\n// Add a counter for each device name\nif ( msg.topic.endsWith('$name') ) {\n    let topic = msg.topic.replace('$name', '$count')\n    let count = cachedMsgs[topic] || 0\n    count = count + 1\n    cachedMsgs[topic] = count\n    let newMsg = {\n        \"topic\": topic, \n        \"payload\": count\n    }\n    // Only send to a single client if we can\n    if ( socketId !== null ) newMsg._socketId = socketId\n    node.send(newMsg)\n}\n\n// Keep the last msg.payload by topic\ncachedMsgs[msg.topic] = msg.payload\n\n// save context for next time\ncontext.set(contextVarName, cachedMsgs)\n\nreturn msg;","outputs":1,"noerr":0,"x":510,"y":100,"wires":[["d0660df5.4f339","c0cacccb.9e3c7"]]},{"id":"afc218de.706f88","type":"inject","z":"e7463dd2.db517","name":"replay","topic":"","payload":"","payloadType":"str","repeat":"","crontab":"","once":false,"x":110,"y":160,"wires":[["eddcc57e.277418"]]},{"id":"c0cacccb.9e3c7","type":"debug","z":"e7463dd2.db517","name":"","active":false,"complete":"true","x":650,"y":180,"wires":[]},{"id":"eddcc57e.277418","type":"change","z":"e7463dd2.db517","name":"","rules":[{"t":"set","p":"cacheControl","pt":"msg","to":"REPLAY","tot":"str"}],"action":"","property":"","from":"","to":"","reg":false,"x":300,"y":160,"wires":[["808d9c98.e305d"]]}]

comments powered by Disqus