You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

225 lines
6.0 KiB

  1. defmodule Microformats2.Items do
  @doc """
  Collects the microformats items found in a parsed HTML tree.

  `nodes` may be a list of nodes or a single node. Text nodes are skipped.
  An element whose class list contains at least one root-level class name
  (as decided by `Microformats2.is_rootlevel?/1`) becomes one item: its
  `:type` is the sorted list of those class names, its properties are
  gathered from its children, and the result is passed through
  `Microformats2.Items.ImpliedProperties.parse/4`. Elements without a
  root-level class are searched recursively.

  Tuples that are not elements (for example the comment, doctype and
  processing-instruction nodes of a Floki tree) are ignored instead of
  raising.

  Found items are appended to `items` (default `[]`), which is returned.
  """
  def parse(nodes, doc, url, items \\ [])

  def parse([head | tail], doc, url, items) when is_bitstring(head),
    do: parse(tail, doc, url, items)

  def parse([head | tail], doc, url, items),
    do: parse(tail, doc, url, parse(head, doc, url, items))

  def parse([], _, _, items), do: items

  # Only real elements have a binary tag name; requiring it keeps
  # `{:pi, name, attrs}` tuples from being mistaken for elements.
  def parse({tag, _, children} = root, doc, url, items) when is_binary(tag) do
    root_classes =
      root
      |> Microformats2.attr_list()
      |> Enum.filter(&Microformats2.is_rootlevel?/1)
      |> Enum.sort()

    case root_classes do
      [] ->
        # Not a microformat itself: keep looking further down the tree.
        parse(children, doc, url, items)

      _ ->
        entry =
          children
          |> parse_sub(doc, url, %{type: root_classes, properties: %{}})
          |> Microformats2.Items.ImpliedProperties.parse(root, url, doc)

        items ++ [entry]
    end
  end

  # Comments, doctypes and similar non-element tuples carry no microformats.
  def parse(node, _, _, items) when is_tuple(node), do: items
  # Walks the child nodes of a microformats root element and folds every
  # property found on them into `item`, which is returned once the list is
  # exhausted.
  defp parse_sub([], _, _, item), do: item

  # Text nodes have no class attribute, so they cannot define a property.
  defp parse_sub([child | children], doc, url, item) when is_bitstring(child),
    do: parse_sub(children, doc, url, item)

  # Only real elements have a binary tag name; requiring it keeps
  # `{:pi, name, attrs}` tuples from being mistaken for elements.
  defp parse_sub([{tag, _, child_children} = child | children], doc, url, item)
       when is_binary(tag) do
    # A child carrying an "h-" class is parsed as a complete item of its own.
    # `[]` is handed on to gen_prop/6 when there is no such nested item.
    nested =
      if Microformats2.has_a?(child, "h-") do
        child |> parse(doc, url, []) |> List.first()
      else
        []
      end

    classes =
      child
      |> Microformats2.attr_list()
      |> Enum.filter(fn
        "p-" <> _ -> true
        "u-" <> _ -> true
        "dt-" <> _ -> true
        "e-" <> _ -> true
        _ -> false
      end)

    updated = gen_prop(child, classes, item, nested, doc, url)

    # A root-level child is not descended into here; presumably its own
    # children were already consumed by the nested parse/4 call above.
    next_item =
      if Microformats2.is_rootlevel?(child) do
        updated
      else
        parse_sub(child_children, doc, url, updated)
      end

    parse_sub(children, doc, url, next_item)
  end

  # Comments and other non-element tuples are skipped instead of raising a
  # FunctionClauseError.
  defp parse_sub([node | children], doc, url, item) when is_tuple(node),
    do: parse_sub(children, doc, url, item)
  # Computes the value of one property class for the element `child`.
  # The class prefix selects the rule; any other class yields nil.

  # p-*: `title` of an abbr or `alt` of an img when that attribute is not
  # blank, otherwise the trimmed text content of the element.
  defp parse_prop("p-" <> _, child, _, _) do
    # TODO value pattern parsing
    {tag, _, _} = child

    case tag do
      "abbr" -> attribute_or_text(child, "title")
      "img" -> attribute_or_text(child, "alt")
      _ -> trimmed_text(child)
    end
  end

  # u-*: the first non-blank attribute from the list that applies to the
  # tag, otherwise the trimmed text content; the result is passed through
  # Microformats2.abs_uri/3.
  defp parse_prop("u-" <> _, child = {tag, _, _}, doc, url) do
    # Attribute names are listed in the order in which they are tried.
    # TODO value-class-pattern (would sit between object/video and abbr)
    candidates =
      case tag do
        t when t in ["a", "area"] -> ["href"]
        "video" -> ["src", "poster"]
        t when t in ["img", "audio", "source"] -> ["src"]
        "object" -> ["data"]
        "abbr" -> ["title"]
        t when t in ["data", "input"] -> ["value"]
        _ -> []
      end

    found =
      Enum.find_value(candidates, fn name ->
        value = child |> Floki.attribute(name) |> List.first()
        if not Microformats2.blank?(value), do: {:ok, value}
      end)

    raw =
      case found do
        {:ok, value} -> value
        nil -> trimmed_text(child)
      end

    Microformats2.abs_uri(raw, url, doc)
  end

  # dt-*: first `datetime` of time/ins/del, first `title` of abbr or first
  # `value` of data/input when the attribute list is not blank, otherwise the
  # trimmed text content.
  defp parse_prop("dt-" <> _, child = {tag, _, _}, _, _) do
    values =
      case tag do
        t when t in ["time", "ins", "del"] -> Floki.attribute(child, "datetime")
        "abbr" -> Floki.attribute(child, "title")
        t when t in ["data", "input"] -> Floki.attribute(child, "value")
        _ -> nil
      end

    if values == nil or Microformats2.blank?(values) do
      trimmed_text(child)
    else
      List.first(values)
    end
  end

  # e-*: a map with the raw HTML of the element's children and the element's
  # text, each passed through Microformats2.stripped_or_nil/1.
  defp parse_prop("e-" <> _, child = {_, _, inner}, _, _) do
    html = inner |> Floki.raw_html() |> Microformats2.stripped_or_nil()
    text = child |> Floki.text() |> Microformats2.stripped_or_nil()

    %{html: html, text: text}
  end

  defp parse_prop(_, _, _, _), do: nil

  # First value of attribute `name` on `node`, or the trimmed text content
  # when that value is blank.
  defp attribute_or_text(node, name) do
    value = node |> Floki.attribute(name) |> List.first()

    if Microformats2.blank?(value), do: trimmed_text(node), else: value
  end

  # Text content of `node` with surrounding whitespace removed.
  defp trimmed_text(node), do: node |> text_content() |> String.trim()
  # Chooses the `:value` stored alongside a nested microformat `p` that is
  # used as a property under `class`: the first `:name` property when
  # Microformats2.is_a?(class, "p") holds, the first `:url` property when
  # Microformats2.is_a?(class, "u") holds, and nil otherwise.
  defp get_value(class, p) do
    cond do
      Microformats2.is_a?(class, "p") and not is_nil(get_in(p, [:properties, :name])) ->
        p |> get_in([:properties, :name]) |> List.first()

      Microformats2.is_a?(class, "u") and not is_nil(get_in(p, [:properties, :url])) ->
        p |> get_in([:properties, :url]) |> List.first()

      Microformats2.is_a?(class, "e") ->
        # TODO handle
        nil

      true ->
        # TODO handle
        nil
    end
  end
  # Records what `child` contributes to `item` and returns the updated item.
  #
  #   * `classes` - the property class names (p-/u-/dt-/e-) found on `child`
  #   * `p`       - the nested item parsed from `child`, or a blank value
  #
  # A root-level child without property classes but with a nested item is
  # appended to `item[:children]`. Otherwise one value per class is appended
  # to the matching key of `item[:properties]`: the nested item plus a
  # `:value` for a root-level child, the result of parse_prop/4 for any other.
  defp gen_prop(child, classes, item, p, doc, url) do
    # The answer does not depend on the class being processed, so ask once
    # instead of once per class.
    rootlevel? = Microformats2.is_rootlevel?(child)

    if Microformats2.blank?(classes) and not Microformats2.blank?(p) and rootlevel? do
      Map.put(item, :children, (item[:children] || []) ++ [p])
    else
      props =
        Enum.reduce(classes, item[:properties], fn class, acc ->
          prop =
            if rootlevel? do
              Map.put(p, :value, get_value(class, p))
            else
              parse_prop(class, child, doc, url)
            end

          # NOTE(review): the key is an atom built from a class name taken
          # from the parsed document. Atoms are never garbage collected, so
          # untrusted input can grow the atom table without bound. Kept as is
          # because callers read these keys as atoms.
          key = class |> strip_prefix() |> to_key() |> String.to_atom()

          Map.put(acc, key, (acc[key] || []) ++ [prop])
        end)

      Map.put(item, :properties, props)
    end
  end
  # Removes a leading "p-", "u-", "dt-" or "e-" from a class name; any other
  # value is returned unchanged.
  defp strip_prefix(class) do
    case class do
      "p-" <> name -> name
      "u-" <> name -> name
      "dt-" <> name -> name
      "e-" <> name -> name
      other -> other
    end
  end
  @doc """
  Appends the textual content of `child` to `text` (default `""`) and returns
  the resulting string.

  Text nodes are appended verbatim. Element nodes contribute the text of
  their children in order. An `img` element is replaced by its `alt`
  attribute or, when that is missing or empty, by its `src` attribute; an
  `img` with neither contributes nothing. Tuples that are not elements (for
  example comment nodes) are ignored.
  """
  def text_content(child, text \\ "")

  # Only real elements have a binary tag name; requiring it keeps
  # `{:pi, name, attrs}` tuples from being walked as if they had children.
  def text_content(child = {elem, _, children}, text) when is_binary(elem) do
    replacement = if elem == "img", do: img_replacement(child), else: ""

    Enum.reduce(children, text <> replacement, fn node, acc ->
      text_content(node, acc)
    end)
  end

  def text_content(child, text) when is_bitstring(child) do
    text <> child
  end

  # Comments and other non-element tuples contribute no text.
  def text_content(child, text) when is_tuple(child), do: text

  # Text standing in for an img element. Floki.attribute/2 returns a list, so
  # the first value has to be taken before it can be tested for emptiness.
  # Always returns a string so the caller can concatenate it.
  defp img_replacement(img) do
    alt = img |> Floki.attribute("alt") |> List.first()

    if alt in [nil, ""] do
      case Floki.attribute(img, "src") do
        [src | _] -> src
        [] -> ""
      end
    else
      alt
    end
  end
  # Replaces every dash in a property name with an underscore.
  defp to_key(str), do: String.replace(str, "-", "_")
  178. end