From 602dcb4ec617634d1fed182ac0309123992e43c6 Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Sun, 23 Feb 2025 16:20:33 +0000 Subject: Implement URL fetching and page parsing --- .gitattributes | 1 + TODO.txt | 20 + flatbot.go | 111 ++ flatbot_test.go | 35 + go.mod | 5 + go.sum | 2 + htmls/2025-02-19-isle-of-dogs.html | 2251 ++++++++++++++++++++++++++++++++++++ 7 files changed, 2425 insertions(+) create mode 100644 .gitattributes create mode 100644 TODO.txt create mode 100644 flatbot.go create mode 100644 flatbot_test.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 htmls/2025-02-19-isle-of-dogs.html diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..823e65b --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.html binary diff --git a/TODO.txt b/TODO.txt new file mode 100644 index 0000000..1cf6e0e --- /dev/null +++ b/TODO.txt @@ -0,0 +1,20 @@ +TODO + + +* Input is URL/URLs for a Rightmove search. URL/URLs are passed as positional + arguments to the main binary, the same way as for the curl or wget tools. +* The binary can run forever or just once (--once option). +* There is an --interval option to control fetch frequency. +* After each iteration the binary will dump the "seen" set as JSON. +* Option to specify a path to the seen list file. + + +GENERAL ALGORITHM + +* Fetch URL. +* Parse flats into a slice of structs. +* Remove already seen flats. +* Send new flats to Telegram. One message per flat? +* Add new flats to the seen list. +* Remove out-of-retention flats from the seen list to prevent it from growing + indefinitely. 
diff --git a/flatbot.go b/flatbot.go
new file mode 100644
index 0000000..ba76595
--- /dev/null
+++ b/flatbot.go
@@ -0,0 +1,126 @@
+// Command flatbot fetches a Rightmove search results page and prints the
+// flats listed on it.
+package main
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+func main() {
+	url := "http://localhost:8000/2025-02-19-isle-of-dogs.html"
+	body, err := fetch(url)
+	if err != nil {
+		log.Fatal(err)
+	}
+	flats, err := parse(body)
+	if err != nil {
+		log.Fatal(err)
+	}
+	fmt.Println(flats)
+}
+
+// fetch performs an HTTP GET on url and returns the whole response body.
+// Any status other than 200 OK is reported as an error.
+func fetch(url string) ([]byte, error) {
+	resp, err := http.Get(url)
+	if err != nil {
+		return nil, err
+	}
+	// Close the body on every path, including the bad-status return below;
+	// a defer placed after the status check would leak it on that path.
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("Bad response status: %d", resp.StatusCode)
+	}
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, err
+	}
+	return body, nil
+}
+
+// flat is a single listing extracted from a search results page.
+type flat struct {
+	URL   string // absolute listing URL, as built by makeURL
+	Price string // price text with the trailing " pcm" removed
+}
+
+// parse extracts the flats found in the HTML document body. Matching nodes
+// that cannot be turned into a flat are logged and skipped, so one malformed
+// listing does not discard the rest of the page.
+func parse(body []byte) ([]flat, error) {
+	doc, err := html.Parse(bytes.NewReader(body))
+	if err != nil {
+		return nil, err
+	}
+	nodes := findNodes(doc)
+	flats := make([]flat, 0, len(nodes))
+	for _, n := range nodes {
+		// Named f, not flat, so the flat type is not shadowed in the loop.
+		f, err := parseNode(n)
+		if err != nil {
+			log.Printf("skipping listing: %v", err)
+			continue
+		}
+		flats = append(flats, f)
+	}
+	return flats, nil
+}
+
+// findNodes returns every <a> element below root whose data-testid attribute
+// is "property-price", in the order Descendants yields them.
+func findNodes(root *html.Node) []*html.Node {
+	var nodes []*html.Node
+	for n := range root.Descendants() {
+		if n.Type != html.ElementNode || n.Data != "a" {
+			continue
+		}
+		attr := matchAttr(n, "data-testid")
+		if attr == nil || attr.Val != "property-price" {
+			continue
+		}
+		nodes = append(nodes, n)
+	}
+	return nodes
+}
+
+// matchAttr returns a pointer to the first attribute of n whose key equals
+// key, or nil if n has no such attribute.
+func matchAttr(n *html.Node, key string) *html.Attribute {
+	for i := range n.Attr {
+		if n.Attr[i].Key == key {
+			// Point into n.Attr itself rather than at a copy of the element.
+			return &n.Attr[i]
+		}
+	}
+	return nil
+}
+
+// parseNode builds a flat from a price link: the URL comes from the href
+// attribute and the price from the first descendant whose Data ends in " pcm".
+func parseNode(root *html.Node) (flat, error) {
+	href := matchAttr(root, "href")
+	if href == nil {
+		return flat{}, errors.New("Couldn't find URL")
+	}
+	for n := range root.Descendants() {
+		if price, found := strings.CutSuffix(n.Data, " pcm"); found {
+			return flat{URL: makeURL(href.Val), Price: price}, nil
+		}
+	}
+	return flat{}, errors.New("Couldn't find price")
+}
+
+// makeURL turns a site-relative listing path into an absolute rightmove.co.uk
+// URL, dropping a trailing "/?channel=RES_LET" if present.
+func makeURL(path string) string {
+	return "https://rightmove.co.uk" + strings.TrimSuffix(path, "/?channel=RES_LET")
+}
diff --git a/flatbot_test.go b/flatbot_test.go
new file mode 100644
index 0000000..0308295
--- /dev/null
+++ b/flatbot_test.go
@@ -0,0 +1,38 @@
+package main
+
+import (
+	"os"
+	"reflect"
+	"testing"
+)
+
+// TestParse checks parse against a saved search results page.
+func TestParse(t *testing.T) {
+	filename := "htmls/2025-02-19-isle-of-dogs.html"
+	data, err := os.ReadFile(filename)
+	if err != nil {
+		// Fatal: parsing nil data would only add a second, misleading failure.
+		t.Fatalf("Could not read %v: %v", filename, err)
+	}
+	want := []flat{
+		{
+			URL:   "https://rightmove.co.uk/properties/156522206#",
+			Price: "£2,500",
+		},
+		{
+			URL:   "https://rightmove.co.uk/properties/158462822#",
+			Price: "£3,000",
+		},
+		{
+			URL:   "https://rightmove.co.uk/properties/157948184#",
+			Price: "£2,400",
+		},
+	}
+	got, err := parse(data)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !reflect.DeepEqual(want, got) {
+		t.Errorf("Parse failed: got: %v, want: %v", got, want)
+	}
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..5bd8d2f
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,5 @@
+module ilvokhin.com/flatbot
+
+go 1.24.0
+
+require golang.org/x/net v0.35.0
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..f4761f9
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,2 @@
+golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
+golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
diff --git a/htmls/2025-02-19-isle-of-dogs.html b/htmls/2025-02-19-isle-of-dogs.html
new file mode 100644
index 0000000..13b04dd
--- /dev/null
+++ b/htmls/2025-02-19-isle-of-dogs.html
@@ -0,0 +1,2251 @@
+Properties To Rent in Isle Of Dogs | Rightmove
+ +
+ + + + +
+
+ + + + + + + +
+
+
+ +
+ + + +
+ + + + + + + + + + +
+
+ + + log_in + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + +
+
+
to
to
Area

Price
-

Bedrooms
-

Property type

Furnished type

Type of let

Property details

Must haves

Don’t show

3 results

Terraced Houses To Rent in Isle Of Dogs, East London, 2 – 3 beds, up to £3,000, added in the last 24 hours

+ + + +
\ No newline at end of file -- cgit v1.2.3-70-g09d2