JavaScript Results Crawler & Go Server

Today the ACSEE results came out.
It was a bit hard to get them because the server was very busy. So, to brush up my JavaScript a little, I quickly wrote a crawler to pull down all the pages, then put together our own webpage to serve all the results.

To see it in action open it here; the link will only stay up for a few hours.
https://goo.gl/4HNnhw

Code:
const Request = require('request');
const fs = require('fs');
const path = require('path');

// make sure the output folder exists before we start writing into it
if (!fs.existsSync(path.join(__dirname, 'matokeo'))) {
  fs.mkdirSync(path.join(__dirname, 'matokeo'));
}

function getPage(number){
  console.log("Getting page..." + number);
  Request.get(`http://www.necta.go.tz/matokeo/ACSEE2017/results/${number}.htm`, function(err, res, body){
    if(err){
      setTimeout(function(){
        console.log("Trying again...." + number);
        // keep trying after 2 seconds because the server is busy
        getPage(number);
      }, 2000);
    }else if(res.statusCode === 200){
      // skip 404s and other non-OK responses, only real result pages get saved
      savePageToDisk(number, body);
    }
  });
}

function savePageToDisk(number, body){
  fs.writeFile(path.join(__dirname,'matokeo',`${number}.htm`), body, function(err){
    if(err){
      console.log("Error saving page")
      console.log(err)
    }
  });
}

function startingPoint(){
  let k = 0;
  // school numbers go up to 5500; a better way is to fetch the index and
  // regex the urls out of it instead of trying every combination
  for(let i = 101; i < 5500; i++){
    setTimeout(function(){
      // fire one request every 250 milliseconds so we don't hammer the server
      getPage(formatNumberLength(i));
    }, k * 250);
    k++;
  }
}

function formatNumberLength(num) {
  // pad the centre number to 4 digits, e.g. 101 -> "0101"
  let r = "" + num;
  while (r.length < 4) {
    r = "0" + r;
  }
  // change "s" into "p" to get private candidate pages
  return "s" + r;
}

startingPoint()

Then, real quick, a small webpage to serve them all.
We go back to Go; its code looks like this:

Code:
package main

import (
    "compress/gzip"
    "io"
    "log"
    "net/http"
    "os"
    "path"

    "github.com/julienschmidt/httprouter"
)

func main() {
    router := httprouter.New()
    router.GET("/", handleIndex)
    router.GET("/matokeo/:filename", handleFiles)

    log.Print("Listening on 8080")
    log.Fatal(http.ListenAndServe(":8080", router))
}

// handleIndex reads index.html from disk and streams it to the client gzip-compressed.
func handleIndex(w http.ResponseWriter, r *http.Request, _ httprouter.Params) {
    file, err := os.Open("./index.html")
    if err != nil {
        io.WriteString(w, "Kuna tatizo jaribu tena")
        return
    }
    defer file.Close()

    w.Header().Set("Content-Encoding", "gzip")
    w.Header().Set("Content-Type", "text/html")

    gzipWriter := gzip.NewWriter(w)
    defer gzipWriter.Close()
    io.Copy(gzipWriter, file)
}

// handleFiles serves one of the crawled result pages from the matokeo folder, gzip-compressed.
func handleFiles(w http.ResponseWriter, r *http.Request, params httprouter.Params) {
    filename := params.ByName("filename")

    file, err := os.Open(path.Join("matokeo", filename))
    if err != nil {
        io.WriteString(w, "Kuna tatizo jaribu tena")
        return
    }
    defer file.Close()

    w.Header().Set("Content-Encoding", "gzip")
    w.Header().Set("Content-Type", "text/html")
    w.Header().Del("Content-Length")

    gzipWriter := gzip.NewWriter(w)
    defer gzipWriter.Close()
    io.Copy(gzipWriter, file)
}

The code could be better, but it does its job.
Take that gzip compression writer, for example: in a real production app it would be a middleware that all the handler funcs pass through automatically, instead of repeating the same thing over and over; or, even better, stop handling gzip compression inside the app altogether and push that work to nginx, which is much faster.
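For illustration, here is a minimal sketch of what such a middleware could look like. It is not taken from the server above: the gzipResponseWriter type, the gzipMiddleware function and the toy handler are my own names, and with this in place the real handlers would drop their own gzip code.

Code:
package main

import (
    "compress/gzip"
    "io"
    "log"
    "net/http"
    "strings"
)

// gzipResponseWriter sends everything the handler writes through the gzip writer.
type gzipResponseWriter struct {
    http.ResponseWriter
    gz *gzip.Writer
}

func (g gzipResponseWriter) Write(b []byte) (int, error) {
    return g.gz.Write(b)
}

// gzipMiddleware wraps any http.Handler and compresses its response
// whenever the client advertises gzip support.
func gzipMiddleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        if !strings.Contains(r.Header.Get("Accept-Encoding"), "gzip") {
            next.ServeHTTP(w, r)
            return
        }
        w.Header().Set("Content-Encoding", "gzip")
        gz := gzip.NewWriter(w)
        defer gz.Close()
        next.ServeHTTP(gzipResponseWriter{ResponseWriter: w, gz: gz}, r)
    })
}

func main() {
    mux := http.NewServeMux()
    mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
        io.WriteString(w, "habari") // plain handler, no gzip code of its own
    })
    // wrap the whole mux once; the httprouter instance in the server above
    // is also an http.Handler and would be wrapped the same way
    log.Fatal(http.ListenAndServe(":8080", gzipMiddleware(mux)))
}

With nginx in front you would instead leave the Go handlers plain and simply turn on gzip in the nginx server block.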

But jokes aside, Go is beautiful.

Another thing: instead of crawling everything from 101 to 5500, which wastes time on a lot of unnecessary 404 pages, a better way is to fetch index.html, use a regex to pull out all the result URLs directly, and crawl only those (a rough sketch follows below). But at the time of writing that index.html wasn't up, because the site was a search form where you type in your details and it looks the results up for you.
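If the index page ever shows up, the brute-force loop could be replaced with something like this. The index URL below is only a guess, the sketch reuses getPage() from the crawler above, and the [sp]\d{4}\.htm pattern simply mirrors how the result pages are named.

Code:
const Request = require('request');

// NOTE: this index URL is an assumption, adjust it to whatever NECTA actually publishes
const indexUrl = 'http://www.necta.go.tz/matokeo/ACSEE2017/index.htm';

Request.get(indexUrl, function(err, res, body){
  if(err || res.statusCode !== 200){
    console.log("Could not fetch the index page");
    return;
  }
  // pull out links like s0101.htm or p0101.htm straight from the HTML
  const matches = body.match(/[sp]\d{4}\.htm/g) || [];
  // de-duplicate, then crawl only the pages that are actually linked
  [...new Set(matches)].forEach(function(page, k){
    setTimeout(function(){
      getPage(page.replace('.htm', '')); // getPage() is the function from the crawler above
    }, k * 250);
  });
});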

In case you're curious, all of this was done on a Raspberry Pi. The Go server is also running from a Raspberry Pi, and I used ngrok to tunnel the traffic through that domain, which is why the domain looks so strange.

(Attached photo: Raspberry Pi 3)