Browse Source

add get_title/1 and get_html/1 to scraper

pull/26/head
Inhji Y. 1 year ago
parent
commit
bd42faced9
  1. 3
      apps/bookmarks/mix.exs
  2. 32
      apps/scraper/lib/scraper.ex
  3. 6
      apps/scraper/mix.exs
  4. 47
      apps/scraper/test/data/example.com.html
  5. 14
      apps/scraper/test/scraper_test.exs
  6. 6
      config/config.exs
  7. 12
      mix.lock

3
apps/bookmarks/mix.exs

@ -27,7 +27,8 @@ defmodule Bookmarks.MixProject do
[
{:ecto_sql, "~> 3.1"},
{:db, in_umbrella: true},
{:tags, in_umbrella: true}
{:tags, in_umbrella: true},
{:scraper, in_umbrella: true}
]
end
end

32
apps/scraper/lib/scraper.ex

@ -3,16 +3,32 @@ defmodule Scraper do
Documentation for `Scraper`.
"""
@doc """
Hello world.
def request_headers() do
user_agent = Application.get_env(:tomie, :user_agent)
## Examples
[
{"User-Agent", user_agent}
]
end
iex> Scraper.hello()
:world
def get_html(url) do
case HTTPoison.get(url, request_headers()) do
{:ok, %{body: body, status_code: 200}} ->
{:ok, body}
"""
def hello do
:world
{:ok, status_code: status_code} ->
{:error, status_code}
{:error, error} ->
{:error, error}
end
end
def get_title(html) do
{:ok, document} = Floki.parse_document(html)
document
|> Floki.find("head title")
|> Floki.text()
end
end

6
apps/scraper/mix.exs

@ -25,9 +25,9 @@ defmodule Scraper.MixProject do
# Run "mix help deps" to learn about dependencies.
defp deps do
[
# {:dep_from_hexpm, "~> 0.3.0"},
# {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"},
# {:sibling_app_in_umbrella, in_umbrella: true}
{:httpoison, "~> 1.6"},
{:floki, "~> 0.26.0"},
{:fast_html, "~> 1.0"}
]
end
end

47
apps/scraper/test/data/example.com.html

@ -0,0 +1,47 @@
<!doctype html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8" />
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type="text/css">
body {
background-color: #f0f0f2;
margin: 0;
padding: 0;
font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
}
div {
width: 600px;
margin: 5em auto;
padding: 2em;
background-color: #fdfdff;
border-radius: 0.5em;
box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
}
a:link, a:visited {
color: #38488f;
text-decoration: none;
}
@media (max-width: 700px) {
div {
margin: 0 auto;
width: auto;
}
}
</style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>

14
apps/scraper/test/scraper_test.exs

@ -2,7 +2,17 @@ defmodule ScraperTest do
use ExUnit.Case
doctest Scraper
test "greets the world" do
assert Scraper.hello() == :world
describe "with fake html" do
test "get_title/1 extracts the website title from the given html" do
html = File.read!("./test/data/example.com.html")
assert Scraper.get_title(html) == "Example Domain"
end
end
describe "with real html" do
test "get_title/1 extracts the website title from the given html" do
{:ok, html} = Scraper.get_html("http://example.com")
assert Scraper.get_title(html) == "Example Domain"
end
end
end

6
config/config.exs

@ -13,6 +13,9 @@ use Mix.Config
config :db,
ecto_repos: [Db.Repo]
config :tomie,
user_agent: "Tomie/0.x (https://inhji.de)"
config :tomie_web,
ecto_repos: [Db.Repo],
generators: [context_app: :tomie]
@ -38,6 +41,9 @@ config :logger, :console,
# Use Jason for JSON parsing in Phoenix
config :phoenix, :json_library, Jason
# Use fast-html for HTML parsing
config :floki, :html_parser, Floki.HTMLParser.FastHtml
# Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above.
import_config "#{Mix.env()}.exs"

12
mix.lock

@ -1,4 +1,5 @@
%{
"certifi": {:hex, :certifi, "2.5.1", "867ce347f7c7d78563450a18a6a28a8090331e77fa02380b4a21962a65d36ee5", [:rebar3], [{:parse_trans, "~>3.3", [hex: :parse_trans, repo: "hexpm", optional: false]}], "hexpm", "805abd97539caf89ec6d4732c91e62ba9da0cda51ac462380bbd28ee697a8c42"},
"connection": {:hex, :connection, "1.0.4", "a1cae72211f0eef17705aaededacac3eb30e6625b04a6117c1b2db6ace7d5976", [:mix], [], "hexpm", "4a0850c9be22a43af9920a71ab17c051f5f7d45c209e40269a1938832510e4d9"},
"cowboy": {:hex, :cowboy, "2.7.0", "91ed100138a764355f43316b1d23d7ff6bdb0de4ea618cb5d8677c93a7a2f115", [:rebar3], [{:cowlib, "~> 2.8.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "~> 1.7.1", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "04fd8c6a39edc6aaa9c26123009200fc61f92a3a94f3178c527b70b767c6e605"},
"cowlib": {:hex, :cowlib, "2.8.0", "fd0ff1787db84ac415b8211573e9a30a3ebe71b5cbff7f720089972b2319c8a4", [:rebar3], [], "hexpm", "79f954a7021b302186a950a32869dbc185523d99d3e44ce430cd1f3289f41ed4"},
@ -7,10 +8,19 @@
"ecto": {:hex, :ecto, "3.4.0", "a7a83ab8359bf816ce729e5e65981ce25b9fc5adfc89c2ea3980f4fed0bfd7c1", [:mix], [{:decimal, "~> 1.6 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}], "hexpm", "5eed18252f5b5bbadec56a24112b531343507dbe046273133176b12190ce19cc"},
"ecto_autoslug_field": {:hex, :ecto_autoslug_field, "2.0.1", "2177c1c253f6dd3efd4b56d1cb76104d0a6ef044c6b9a7a0ad6d32665c4111e5", [:mix], [{:ecto, ">= 2.1.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:slugger, ">= 0.2.0", [hex: :slugger, repo: "hexpm", optional: false]}], "hexpm", "a3cc73211f2e75b89a03332183812ebe1ac08be2e25a1df5aa3d1422f92c45c3"},
"ecto_sql": {:hex, :ecto_sql, "3.4.1", "3c9136ba138f9b74d31286c73c61232a92bd19385f7c5607bdeb3a4587ef91f5", [:mix], [{:db_connection, "~> 2.2", [hex: :db_connection, repo: "hexpm", optional: false]}, {:ecto, "~> 3.4.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:myxql, "~> 0.3.0 or ~> 0.4.0", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.15.0", [hex: :postgrex, repo: "hexpm", optional: true]}, {:tds, "~> 2.1.0", [hex: :tds, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "9b4be0bffe7b0bdf5393defcae52712f248e70cc2bc0e8ab6ddb03be66371516"},
"fast_html": {:hex, :fast_html, "1.0.3", "2cc0d4b68496266a1530e0c852cafeaede0bd10cfdee26fda50dc696c203162f", [:make, :mix], [], "hexpm", "ab3d782b639d3c4655fbaec0f9d032c91f8cab8dd791ac7469c2381bc7c32f85"},
"file_system": {:hex, :file_system, "0.2.8", "f632bd287927a1eed2b718f22af727c5aeaccc9a98d8c2bd7bff709e851dc986", [:mix], [], "hexpm", "97a3b6f8d63ef53bd0113070102db2ce05352ecf0d25390eb8d747c2bde98bca"},
"floki": {:hex, :floki, "0.26.0", "4df88977e2e357c6720e1b650f613444bfb48c5acfc6a0c646ab007d08ad13bf", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "e7b66ce7feef5518a9cd9fc7b52dd62a64028bd9cb6d6ad282a0f0fc90a4ae52"},
"gettext": {:hex, :gettext, "0.17.4", "f13088e1ec10ce01665cf25f5ff779e7df3f2dc71b37084976cf89d1aa124d5c", [:mix], [], "hexpm", "3c75b5ea8288e2ee7ea503ff9e30dfe4d07ad3c054576a6e60040e79a801e14d"},
"hackney": {:hex, :hackney, "1.15.2", "07e33c794f8f8964ee86cebec1a8ed88db5070e52e904b8f12209773c1036085", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.5", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm", "e0100f8ef7d1124222c11ad362c857d3df7cb5f4204054f9f0f4a728666591fc"},
"html_entities": {:hex, :html_entities, "0.5.1", "1c9715058b42c35a2ab65edc5b36d0ea66dd083767bef6e3edb57870ef556549", [:mix], [], "hexpm", "30efab070904eb897ff05cd52fa61c1025d7f8ef3a9ca250bc4e6513d16c32de"},
"httpoison": {:hex, :httpoison, "1.6.2", "ace7c8d3a361cebccbed19c283c349b3d26991eff73a1eaaa8abae2e3c8089b6", [:mix], [{:hackney, "~> 1.15 and >= 1.15.2", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "aa2c74bd271af34239a3948779612f87df2422c2fdcfdbcec28d9c105f0773fe"},
"idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "4bdd305eb64e18b0273864920695cb18d7a2021f31a11b9c5fbcd9a253f936e2"},
"jason": {:hex, :jason, "1.2.0", "10043418c42d2493d0ee212d3fddd25d7ffe484380afad769a0a38795938e448", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "116747dbe057794c3a3e4e143b7c8390b29f634e16c78a7f59ba75bfa6852e7f"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mime": {:hex, :mime, "1.3.1", "30ce04ab3175b6ad0bdce0035cba77bba68b813d523d1aac73d9781b4d193cf8", [:mix], [], "hexpm", "6cbe761d6a0ca5a31a0931bf4c63204bceb64538e664a8ecf784a9a6f3b875f1"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
"parse_trans": {:hex, :parse_trans, "3.3.0", "09765507a3c7590a784615cfd421d101aec25098d50b89d7aa1d66646bc571c1", [:rebar3], [], "hexpm", "17ef63abde837ad30680ea7f857dd9e7ced9476cdd7b0394432af4bfc241b960"},
"phoenix": {:hex, :phoenix, "1.4.16", "2cbbe0c81e6601567c44cc380c33aa42a1372ac1426e3de3d93ac448a7ec4308", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 1.1", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:plug, "~> 1.8.1 or ~> 1.9", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 1.0 or ~> 2.0", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "856cc1a032fa53822737413cf51aa60e750525d7ece7d1c0576d90d7c0f05c24"},
"phoenix_ecto": {:hex, :phoenix_ecto, "4.1.0", "a044d0756d0464c5a541b4a0bf4bcaf89bffcaf92468862408290682c73ae50d", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:phoenix_html, "~> 2.9", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "c5e666a341ff104d0399d8f0e4ff094559b2fde13a5985d4cb5023b2c2ac558b"},
"phoenix_html": {:hex, :phoenix_html, "2.14.1", "7dabafadedb552db142aacbd1f11de1c0bbaa247f90c449ca549d5e30bbc66b4", [:mix], [{:plug, "~> 1.5", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "536d5200ad37fecfe55b3241d90b7a8c3a2ca60cd012fc065f776324fa9ab0a9"},
@ -23,5 +33,7 @@
"pow": {:hex, :pow, "1.0.19", "e6295de629338661afdc52b3420f1fa37c191d246aef5d844161843fed6fe88b", [:mix], [{:ecto, "~> 2.2 or ~> 3.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:phoenix, "~> 1.3.0 or ~> 1.4.0", [hex: :phoenix, repo: "hexpm", optional: false]}, {:phoenix_html, ">= 2.0.0 and <= 3.0.0", [hex: :phoenix_html, repo: "hexpm", optional: false]}, {:plug, ">= 1.5.0 and < 2.0.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "da77fab98e038b39c5360a77dc98e606cdd1446dabdd65a88991e8b23f67a356"},
"ranch": {:hex, :ranch, "1.7.1", "6b1fab51b49196860b733a49c07604465a47bdb78aa10c1c16a3d199f7f8c881", [:rebar3], [], "hexpm", "451d8527787df716d99dc36162fca05934915db0b6141bbdac2ea8d3c7afc7d7"},
"slugger": {:hex, :slugger, "0.3.0", "efc667ab99eee19a48913ccf3d038b1fb9f165fa4fbf093be898b8099e61b6ed", [:mix], [], "hexpm", "20d0ded0e712605d1eae6c5b4889581c3460d92623a930ddda91e0e609b5afba"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.5", "6eaf7ad16cb568bb01753dbbd7a95ff8b91c7979482b95f38443fe2c8852a79b", [:make, :mix, :rebar3], [], "hexpm", "13104d7897e38ed7f044c4de953a6c28597d1c952075eb2e328bc6d6f2bfc496"},
"telemetry": {:hex, :telemetry, "0.4.1", "ae2718484892448a24470e6aa341bc847c3277bfb8d4e9289f7474d752c09c7f", [:rebar3], [], "hexpm", "4738382e36a0a9a2b6e25d67c960e40e1a2c95560b9f936d8e29de8cd858480f"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm", "1d1848c40487cdb0b30e8ed975e34e025860c02e419cb615d255849f3427439d"},
}
Loading…
Cancel
Save