You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

212 lines
5.6 KiB

  1. defmodule Microformats2.Items do
  2. import Microformats2.Helpers
  3. alias Microformats2.Items.ImpliedProperties
  @doc """
  Collects microformats2 items found in `nodes`, appending them to `items`.

  `nodes` may be a list of parsed HTML nodes or a single node. Text nodes are
  skipped. An element that carries at least one class accepted by
  `is_rootlevel?/1` becomes one item whose `:type` is the sorted list of those
  classes; its properties are gathered from its children by `parse_sub/4` and
  the result is passed through `ImpliedProperties.parse/4`. Elements without
  such a class are searched recursively.

  Nodes that are neither text nor `{tag, attributes, children}` elements (for
  example `{:comment, text}`) are ignored instead of raising.

  Returns `items` with every newly found item appended in the order found.
  """
  @spec parse(term(), term(), term(), list()) :: list()
  def parse(nodes, doc, url, items \\ [])

  def parse([head | tail], doc, url, items) when is_binary(head),
    do: parse(tail, doc, url, items)

  def parse([head | tail], doc, url, items),
    do: parse(tail, doc, url, parse(head, doc, url, items))

  def parse([], _, _, items), do: items

  def parse({tag, _, children} = root, doc, url, items) when is_binary(tag) do
    root_classes =
      root
      |> attr_list()
      |> Enum.filter(&is_rootlevel?/1)
      |> Enum.sort()

    if Enum.empty?(root_classes) do
      # Not a microformat root itself: keep looking further down the tree.
      parse(children, doc, url, items)
    else
      entry =
        children
        |> parse_sub(doc, url, %{type: root_classes, properties: %{}})
        |> ImpliedProperties.parse(root, url, doc)

      items ++ [entry]
    end
  end

  # Comments, doctypes and other non-element nodes cannot hold microformats.
  def parse(_, _, _, items), do: items
  # Folds the properties declared by a list of child nodes into `item`.
  #
  # Text nodes are skipped. For every element, the classes starting with
  # "p-", "u-", "dt-" or "e-" are turned into properties by `gen_prop/6`.
  # An element for which `is_rootlevel?/1` holds is not descended into; any
  # other element has its own children processed recursively.
  # Returns the updated item map.
  defp parse_sub([], _, _, item), do: item

  defp parse_sub([child | children], doc, url, item) when is_binary(child),
    do: parse_sub(children, doc, url, item)

  defp parse_sub([{tag, _, child_children} = child | children], doc, url, item)
       when is_binary(tag) do
    # A child carrying an "h-" class is parsed as its own item first; `[]`
    # stands for "no nested item".
    nested =
      if has_a?(child, "h-") do
        child |> parse(doc, url, []) |> List.first()
      else
        []
      end

    classes =
      child
      |> attr_list()
      |> Enum.filter(fn
        "p-" <> _ -> true
        "u-" <> _ -> true
        "dt-" <> _ -> true
        "e-" <> _ -> true
        _ -> false
      end)

    updated = gen_prop(child, classes, item, nested, doc, url)

    next_item =
      if is_rootlevel?(child),
        do: updated,
        else: parse_sub(child_children, doc, url, updated)

    parse_sub(children, doc, url, next_item)
  end

  # Comments and other non-element nodes declare no properties; skip them
  # instead of failing with a FunctionClauseError.
  defp parse_sub([_ | children], doc, url, item), do: parse_sub(children, doc, url, item)
  # Extracts the value of one property class from `child`.
  #
  # The first argument is the full class name; its prefix selects the rule:
  #   * "p-"  - plain text: abbr[title], data/input[value], img/area[alt],
  #             otherwise the trimmed text content.
  #   * "u-"  - URL: a/area/link[href], img/audio/video/source/iframe[src],
  #             object[data], video[poster], abbr[title], data/input[value],
  #             otherwise the trimmed text content; the result is passed
  #             through `abs_uri/3`.
  #   * "dt-" - datetime: time/ins/del[datetime], abbr[title],
  #             data/input[value], otherwise the trimmed text content.
  #   * "e-"  - embedded markup: a map with `:html` and `:text`.
  # Any other class yields `nil`.
  defp parse_prop("p-" <> _, {elem, _, _} = child, _, _) do
    # TODO value pattern parsing
    title = first_attribute(child, "title")
    value = first_attribute(child, "value")
    alt = first_attribute(child, "alt")

    cond do
      elem == "abbr" and present?(title) ->
        title

      elem in ["data", "input"] and present?(value) ->
        value

      elem in ["img", "area"] and present?(alt) ->
        alt

      true ->
        child |> text_content() |> String.trim()
    end
  end

  defp parse_prop("u-" <> _, {elem, _, _} = child, doc, url) do
    href = first_attribute(child, "href")
    src = first_attribute(child, "src")
    data = first_attribute(child, "data")
    poster = first_attribute(child, "poster")
    title = first_attribute(child, "title")
    value = first_attribute(child, "value")

    raw =
      cond do
        elem in ["a", "area", "link"] and present?(href) ->
          href

        elem in ["img", "audio", "video", "source", "iframe"] and present?(src) ->
          src

        elem == "object" and present?(data) ->
          data

        elem == "video" and present?(poster) ->
          poster

        # TODO value-class-pattern at this position
        elem == "abbr" and present?(title) ->
          title

        elem in ["data", "input"] and present?(value) ->
          value

        true ->
          child |> text_content() |> String.trim()
      end

    abs_uri(raw, url, doc)
  end

  defp parse_prop("dt-" <> _, {elem, _, _} = child, _, _) do
    # Presence is checked on the attribute lists themselves, so an attribute
    # that exists is used as-is.
    dt = Floki.attribute(child, "datetime")
    title = Floki.attribute(child, "title")
    value = Floki.attribute(child, "value")

    cond do
      elem in ["time", "ins", "del"] and present?(dt) ->
        List.first(dt)

      elem == "abbr" and present?(title) ->
        List.first(title)

      elem in ["data", "input"] and present?(value) ->
        List.first(value)

      true ->
        child |> text_content() |> String.trim()
    end
  end

  defp parse_prop("e-" <> _, {_, _, children} = child, _, _) do
    %{
      html: stripped_or_nil(Floki.raw_html(children)),
      text: stripped_or_nil(Floki.text(child))
    }
  end

  defp parse_prop(_, _, _, _), do: nil

  # Returns the first value of attribute `name` on `node`, or `nil`.
  defp first_attribute(node, name), do: node |> Floki.attribute(name) |> List.first()
  # Chooses the `:value` stored alongside a nested item `p` that is embedded
  # under property class `class`.
  #
  # When `is_a?(class, "p")` holds and `p` has a "name" property, its first
  # value is returned; when `is_a?(class, "u")` holds and `p` has a "url"
  # property, its first value is returned. Every other case yields `nil`.
  defp get_value(class, p) do
    key_of_name = normalized_key("name")
    key_of_url = normalized_key("url")

    first_under = fn key -> List.first(p[:properties][key]) end

    cond do
      is_a?(class, "p") and not is_nil(p[:properties][key_of_name]) ->
        first_under.(key_of_name)

      is_a?(class, "u") and not is_nil(p[:properties][key_of_url]) ->
        first_under.(key_of_url)

      # TODO handle "e" classes and the remaining cases
      true ->
        nil
    end
  end
  # Adds the properties that `child` declares through `classes` to `item`.
  #
  # For each class the value is either the nested item `p` extended with a
  # `:value` (when `is_rootlevel?(child)` holds) or the result of
  # `parse_prop/4`. Values are appended to the list stored under the
  # normalized property key.
  #
  # When there are no property classes but `child` is a root-level element
  # with a present `p`, `p` is appended to `item[:children]` instead and the
  # properties are left untouched.
  defp gen_prop(child, classes, item, p, doc, url) do
    collected =
      Enum.reduce(classes, item[:properties], fn class, acc ->
        entry =
          if is_rootlevel?(child),
            do: Map.put(p, :value, get_value(class, p)),
            else: parse_prop(class, child, doc, url)

        key =
          class
          |> strip_prefix()
          |> to_key()
          |> normalized_key()

        existing =
          case acc[key] do
            nil -> []
            values -> values
          end

        Map.put(acc, key, existing ++ [entry])
      end)

    nested_without_property? = blank?(classes) and present?(p) and is_rootlevel?(child)

    if nested_without_property? do
      Map.update(item, :children, [p], fn kids -> kids ++ [p] end)
    else
      Map.put(item, :properties, collected)
    end
  end
  # Drops a leading "p-", "u-", "dt-" or "e-" from a class name; anything
  # else is returned unchanged.
  defp strip_prefix(class) do
    case class do
      "p-" <> name -> name
      "u-" <> name -> name
      "dt-" <> name -> name
      "e-" <> name -> name
      other -> other
    end
  end
  @doc """
  Appends the text content of `child` to `text` and returns the result.

  Text nodes are appended verbatim and elements are walked depth-first. An
  `img` element contributes the first value of its `alt` attribute, or of its
  `src` attribute when `alt` is blank; an `img` with neither contributes
  nothing. Nodes that are neither text nor elements (for example
  `{:comment, text}`) are ignored.
  """
  @spec text_content(term(), String.t()) :: String.t()
  def text_content(child, text \\ "")

  def text_content({elem, _, children} = child, text) do
    replacement =
      if elem == "img" do
        alt = Floki.attribute(child, "alt")
        candidates = if blank?(alt), do: Floki.attribute(child, "src"), else: alt

        # `List.first/1` is nil when the image has neither attribute; without
        # the fallback `text <> nil` would raise an ArgumentError.
        List.first(candidates) || ""
      else
        ""
      end

    Enum.reduce(children, text <> replacement, &text_content/2)
  end

  def text_content(child, text) when is_binary(child), do: text <> child

  # Comments and other non-element nodes carry no text content.
  def text_content(_, text), do: text
  166. end