From 5ca9d983435caad715c65f998e17fb6246c2c7e0 Mon Sep 17 00:00:00 2001
From: Chris Lowis
Date: Fri, 9 Nov 2018 16:02:15 +0000
Subject: [PATCH] Add a script for generating coop data from coops.tech mirror

---
 scripts/.gitignore            |   1 +
 scripts/Gemfile               |   4 +
 scripts/Gemfile.lock          |  28 +++++
 scripts/README.md             |  19 ++++
 scripts/coop.erb.md           |  26 +++++
 scripts/migrate_coop_pages.rb | 179 ++++++++++++++++++++++++++++++++++
 6 files changed, 257 insertions(+)
 create mode 100644 scripts/.gitignore
 create mode 100644 scripts/Gemfile
 create mode 100644 scripts/Gemfile.lock
 create mode 100644 scripts/README.md
 create mode 100644 scripts/coop.erb.md
 create mode 100644 scripts/migrate_coop_pages.rb

diff --git a/scripts/.gitignore b/scripts/.gitignore
new file mode 100644
index 0000000..1f89de9
--- /dev/null
+++ b/scripts/.gitignore
@@ -0,0 +1 @@
+mirror
\ No newline at end of file
diff --git a/scripts/Gemfile b/scripts/Gemfile
new file mode 100644
--- /dev/null
+++ b/scripts/Gemfile
@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+
+gem 'nokogiri'
+gem 'activesupport'
diff --git a/scripts/Gemfile.lock b/scripts/Gemfile.lock
new file mode 100644
--- /dev/null
+++ b/scripts/Gemfile.lock
@@ -0,0 +1,28 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    activesupport (5.1.4)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (~> 0.7)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+    concurrent-ruby (1.1.1)
+    i18n (0.9.5)
+      concurrent-ruby (~> 1.0)
+    mini_portile2 (2.3.0)
+    minitest (5.11.3)
+    nokogiri (1.8.5)
+      mini_portile2 (~> 2.3.0)
+    thread_safe (0.3.6)
+    tzinfo (1.2.5)
+      thread_safe (~> 0.1)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  activesupport
+  nokogiri
+
+BUNDLED WITH
+   1.17.1
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,19 @@
+# Scripts
+
+This directory contains scripts for migrating data from WordPress to Jekyll.
+
+They rely on a local mirror of the [archived CoTech website](https://coops.tech.archived.website/), which you can create by running
+
+    mkdir mirror
+    cd mirror
+    httrack https://coops.tech.archived.website/
+
+`httrack` is available via Homebrew on macOS.
+
+Back in the `scripts` directory, install the dependencies from RubyGems by running
+
+    bundle install
+
+## Creating data files in `_coops`
+
+    ruby migrate_coop_pages.rb
diff --git a/scripts/coop.erb.md b/scripts/coop.erb.md
new file mode 100644
index 0000000..e970de7
--- /dev/null
+++ b/scripts/coop.erb.md
@@ -0,0 +1,26 @@
+---
+title: <%= title %>
+name: <%= name %>
+website: <%= website %>
+email: <%= email %>
+twitter: <%= twitter %>
+github: <%= github %>
+telephone: <%= telephone %>
+address: <%= address %>
+latitude: <%= latitude %>
+longitude: <%= longitude %>
+clients:
+<% clients.each do |client| %>
+- <%= client %>
+<% end %>
+services:
+<% services.each do |service| %>
+- <%= service %>
+<% end %>
+technologies:
+<% technologies.each do |technology| %>
+- <%= technology %>
+<% end %>
+---
+
+<%= body %>
diff --git a/scripts/migrate_coop_pages.rb b/scripts/migrate_coop_pages.rb
new file mode 100644
--- /dev/null
+++ b/scripts/migrate_coop_pages.rb
@@ -0,0 +1,179 @@
+# frozen_string_literal: true
+
+# Generates the Jekyll data files in `_coops` from a local mirror of the
+# archived coops.tech website. README.md explains how to create the mirror.
+
+require 'erb'
+require 'uri'
+
+require 'nokogiri'
+require 'active_support/inflector'
+
+# Reads the fields of one co-op out of its archived HTML page. The public
+# methods are the values that `coop.erb.md` interpolates into the data file.
+class Coop
+  # XPath of the section that the contact details are read from.
+  CONTACT_SECTION = '/html/body/div/div[2]/div/div/div[1]/section'
+  # XPath of the inline script that the coordinates are read from.
+  MAP_SCRIPT = '/html/body/div/section/script'
+
+  attr_reader :doc
+
+  # @param html [String] source of one page from the mirror
+  def initialize(html)
+    @doc = Nokogiri::HTML(html)
+  end
+
+  def name
+    doc.xpath('//*[@id="page-banner"]/div/div/h2').text
+  end
+
+  alias title name
+
+  def website
+    doc.xpath('//*[@id="page-banner"]/div/div/a[2]').text.strip
+  end
+
+  def email
+    doc.xpath("#{CONTACT_SECTION}/div[2]/p/a").text.strip
+  end
+
+  # @return [String, nil] last path segment of the second social link
+  def twitter
+    last_path_segment("#{CONTACT_SECTION}/div[1]/ul/li[2]/a/@href")
+  end
+
+  # @return [String, nil] last path segment of the third social link
+  def github
+    last_path_segment("#{CONTACT_SECTION}/div[1]/ul/li[3]/a/@href")
+  end
+
+  def telephone
+    doc.xpath("#{CONTACT_SECTION}/div[3]/p").text
+  end
+
+  def address
+    doc.xpath("#{CONTACT_SECTION}/div[4]/p").text
+  end
+
+  # @return [String, nil] value of `var latitude` in the map script
+  def latitude
+    script_variable('latitude')
+  end
+
+  # @return [String, nil] value of `var longitude` in the map script
+  def longitude
+    script_variable('longitude')
+  end
+
+  def services
+    link_components('a.service-thumb')
+  end
+
+  # @return [Array<String>] parameterized text of each client heading
+  def clients
+    doc.css('div.client-thumb-container').map do |node|
+      # Nokogiri's #text always returns a String, so `&.` is not needed.
+      node.xpath('h5').text.parameterize
+    end
+  end
+
+  def technologies
+    link_components('a.technology-thumb')
+  end
+
+  def body
+    doc.xpath('/html/body/div/div[2]/div/div/div[2]/section[1]').text.strip
+  end
+
+  # Binding of this instance, so that ERB can call the methods above.
+  def erb_binding
+    binding
+  end
+
+  private
+
+  # Last path segment of the URL selected by +xpath+. Returns nil when the
+  # link is missing, and warns instead of aborting the whole migration when
+  # the href is not a valid URI.
+  def last_path_segment(xpath)
+    URI.parse(doc.xpath(xpath).text).path.to_s.split('/').last
+  rescue URI::InvalidURIError => e
+    warn "Ignoring malformed link: #{e.message}"
+    nil
+  end
+
+  # Value assigned to the JavaScript variable +variable+ in the map script,
+  # or nil when it is absent. `[^'\n]+` stops at the first closing quote; a
+  # greedy `.+` would run on to the last `';` of a line holding both variables.
+  def script_variable(variable)
+    doc.xpath(MAP_SCRIPT).text[/var #{variable} = '([^'\n]+)';/, 1]
+  end
+
+  # Component at index 2 of each matching link's href, split on `/`.
+  def link_components(selector)
+    doc.css(selector).map { |node| node.xpath('@href').text.split('/')[2] }
+  end
+end
+
+# Compiles +source+ with trim mode '<>'. ERB takes the mode as a keyword
+# since Ruby 2.6, which deprecated the positional form, so the positional
+# call is kept only as a fallback for older Rubies.
+def compile_template(source)
+  if ERB.instance_method(:initialize).parameters.assoc(:key)
+    ERB.new(source, trim_mode: '<>')
+  else
+    ERB.new(source, nil, '<>')
+  end
+end
+
+source_pages = %w[
+  agile-collective.html
+  alpha-communication.html
+  animorph.html
+  aptivate.html
+  autonomic.html
+  blake-house-filmmakers-co-op.html
+  calverts.html
+  cbn.html
+  cetis-llp.html
+  chapel-street-studio.html
+  co-operative-web.html
+  creative-coop.html
+  dev-the-developers-society.html
+  digital-liberties.html
+  dtc-innovation.html
+  fairmondo-uk.html
+  founders-and-coders.html
+  gildedsplinters.html
+  glowbox-design.html
+  go-free-range.html
+  graphics-coop.html
+  mc3.html
+  media-coop.html
+  mediablaze-hosts.html
+  netuxo.html
+  open-data-services.html
+  open-ecommerce.html
+  outlandish.html
+  secure-active-c-i-c.html
+  small-axe.html
+  tableflip.html
+  the-dot-project.html
+  wave.html
+  we-are-open.html
+  webarchitects.html
+]
+
+mirror_dir = File.join(__dir__, 'mirror', 'coops.tech.archived.website', 'co-op')
+output_dir = File.join(__dir__, '..', '_coops')
+
+# Resolve the template next to this script rather than in the working
+# directory, and compile it once instead of once per page.
+renderer = compile_template(File.read(File.join(__dir__, 'coop.erb.md')))
+
+source_pages.each do |page|
+  coop = Coop.new(File.read(File.join(mirror_dir, page)))
+  output_fn = File.join(output_dir, "#{File.basename(page, '.html')}.md")
+  File.write(output_fn, renderer.result(coop.erb_binding))
+end