Add a script for generating coop data from coops.tech mirror

This commit is contained in:
Chris Lowis 2018-11-09 16:02:15 +00:00
parent 40f7f42ec2
commit 5ca9d98343
6 changed files with 208 additions and 0 deletions

1
scripts/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
mirror

2
scripts/Gemfile Normal file
View File

@ -0,0 +1,2 @@
# Gem server to resolve the dependencies below from; without a source,
# `bundle install` can only use gems that are already installed locally.
source 'https://rubygems.org'

gem 'nokogiri'
gem 'activesupport'

27
scripts/Gemfile.lock Normal file
View File

@ -0,0 +1,27 @@
GEM
specs:
activesupport (5.1.4)
concurrent-ruby (~> 1.0, >= 1.0.2)
i18n (~> 0.7)
minitest (~> 5.1)
tzinfo (~> 1.1)
concurrent-ruby (1.1.1)
i18n (0.9.5)
concurrent-ruby (~> 1.0)
mini_portile2 (2.3.0)
minitest (5.11.3)
nokogiri (1.8.5)
mini_portile2 (~> 2.3.0)
thread_safe (0.3.6)
tzinfo (1.2.5)
thread_safe (~> 0.1)
PLATFORMS
ruby
DEPENDENCIES
activesupport
nokogiri
BUNDLED WITH
1.17.1

19
scripts/README.md Normal file
View File

@ -0,0 +1,19 @@
# Scripts
This directory contains scripts for migrating data from WordPress to Jekyll.
They rely on a local mirror of the [archived CoTech website](https://coops.tech.archived.website/), which you can create by running:
mkdir mirror
cd mirror
httrack https://coops.tech.archived.website/
`httrack` is available via Homebrew on macOS.
You then need to install the dependencies from RubyGems by running:
bundle install
## Creating data files in `_coops`
ruby migrate_coop_pages.rb

26
scripts/coop.erb.md Normal file
View File

@ -0,0 +1,26 @@
---
title: <%= title %>
name: <%= name %>
website: <%= website %>
email: <%= email %>
twitter: <%= twitter %>
github: <%= github %>
telephone: <%= telephone %>
address: <%= address %>
latitude: <%= latitude %>
longitude: <%= longitude %>
clients:
<% clients.each do |client| %>
- <%= client %>
<% end %>
services:
<% services.each do |service| %>
- <%= service %>
<% end %>
technologies:
<% technologies.each do |technology| %>
- <%= technology %>
<% end %>
---
<%= body %>

View File

@ -0,0 +1,133 @@
require 'uri'

require 'active_support/inflector'
require 'nokogiri'
# Wraps the parsed HTML of one mirrored co-op page and exposes the values
# that the `coop.erb.md` template interpolates.
#
# Most readers rely on absolute XPath expressions, so they are tied to the
# exact markup of the mirrored pages. When a node is missing, the text readers
# return an empty string, and `twitter`, `github`, `latitude` and `longitude`
# return nil.
class Coop
  # XPath of the page banner element (holds the name and website link).
  BANNER = '//*[@id="page-banner"]/div/div'
  # XPath of the section the contact details are read from.
  CONTACT_SECTION = '/html/body/div/div[2]/div/div/div[1]/section'
  # XPath of the section the free-text description is read from.
  BODY_SECTION = '/html/body/div/div[2]/div/div/div[2]/section[1]'
  # XPath of the inline script that assigns the map coordinates.
  MAP_SCRIPT = '/html/body/div/section/script'

  # @return [Nokogiri::HTML::Document] the parsed page
  attr_reader :doc

  # @param html [String] raw HTML of a single co-op page
  def initialize(html)
    @doc = Nokogiri::HTML(html)
  end

  # @return [String] text of the banner heading
  def name
    doc.xpath("#{BANNER}/h2").text
  end
  alias title name

  # @return [String] stripped text of the second link in the banner
  def website
    doc.xpath("#{BANNER}/a[2]").text.strip
  end

  # @return [String] stripped text of the link in the second contact block
  def email
    doc.xpath("#{CONTACT_SECTION}/div[2]/p/a").text.strip
  end

  # Assumes the second item of the contact link list is the Twitter link.
  #
  # @return [String, nil] last path segment of that link's href
  def twitter
    last_path_segment("#{CONTACT_SECTION}/div[1]/ul/li[2]/a/@href")
  end

  # Assumes the third item of the contact link list is the GitHub link.
  #
  # @return [String, nil] last path segment of that link's href
  def github
    last_path_segment("#{CONTACT_SECTION}/div[1]/ul/li[3]/a/@href")
  end

  # @return [String] text of the third contact block
  def telephone
    doc.xpath("#{CONTACT_SECTION}/div[3]/p").text
  end

  # @return [String] text of the fourth contact block
  def address
    doc.xpath("#{CONTACT_SECTION}/div[4]/p").text
  end

  # @return [String, nil] value assigned to `var latitude` in the map script
  def latitude
    script_variable('latitude')
  end

  # @return [String, nil] value assigned to `var longitude` in the map script
  def longitude
    script_variable('longitude')
  end

  # @return [Array<String, nil>] slug of every `a.service-thumb` link
  def services
    thumb_slugs('a.service-thumb')
  end

  # @return [Array<String>] parameterized heading of every client thumbnail
  def clients
    doc.css('div.client-thumb-container').map do |node|
      # Node text is never nil, so no safe navigation is needed here.
      node.xpath('h5').text.parameterize
    end
  end

  # @return [Array<String, nil>] slug of every `a.technology-thumb` link
  def technologies
    thumb_slugs('a.technology-thumb')
  end

  # @return [String] stripped text of the description section
  def body
    doc.xpath(BODY_SECTION).text.strip
  end

  # Exposes this instance's binding so an ERB template can call the readers
  # above as bare names.
  #
  # @return [Binding]
  def erb_binding
    binding
  end

  private

  # Returns the last path segment of the URL found at +href_xpath+, or nil
  # when the attribute is absent or the URL has no path segments.
  #
  # @raise [URI::InvalidURIError] if the href is not a parseable URI
  def last_path_segment(href_xpath)
    # Surrounding whitespace would make URI.parse reject an otherwise valid href.
    url = doc.xpath(href_xpath).text.strip
    # `path` is nil for opaque URIs (e.g. mailto:), hence the to_s.
    URI.parse(url).path.to_s.split('/').last
  end

  # Extracts the quoted value of `var <variable> = '...';` from the map script.
  # The capture stops at the first closing quote, so several assignments on a
  # single line are not swallowed into one value.
  def script_variable(variable)
    script = doc.xpath(MAP_SCRIPT).text
    script[/var #{Regexp.escape(variable)} = '([^']+)';/, 1]
  end

  # Maps every link matching +selector+ to the third '/'-separated piece of
  # its href (e.g. "/service/foo/" gives "foo").
  def thumb_slugs(selector)
    doc.css(selector).map { |node| node.xpath('@href').text.split('/')[2] }
  end
end
# File names of the co-op pages to migrate, as found in the `co-op` directory
# of the mirrored site. Each one yields one Markdown file in `_coops`.
source_pages = %w[
  agile-collective.html
  alpha-communication.html
  animorph.html
  aptivate.html
  autonomic.html
  blake-house-filmmakers-co-op.html
  calverts.html
  cbn.html
  cetis-llp.html
  chapel-street-studio.html
  co-operative-web.html
  creative-coop.html
  dev-the-developers-society.html
  digital-liberties.html
  dtc-innovation.html
  fairmondo-uk.html
  founders-and-coders.html
  gildedsplinters.html
  glowbox-design.html
  go-free-range.html
  graphics-coop.html
  mc3.html
  media-coop.html
  mediablaze-hosts.html
  netuxo.html
  open-data-services.html
  open-ecommerce.html
  outlandish.html
  secure-active-c-i-c.html
  small-axe.html
  tableflip.html
  the-dot-project.html
  wave.html
  we-are-open.html
  webarchitects.html
]
require 'erb'

# Renders every page listed in `source_pages` through the `coop.erb.md`
# template and writes the result to `../_coops/<page>.md`.
#
# All paths are anchored to this script's directory, so the script behaves the
# same regardless of the directory it is launched from.
script_dir = __dir__
mirror_dir = File.join(script_dir, 'mirror', 'coops.tech.archived.website', 'co-op')
output_dir = File.join(script_dir, '..', '_coops')

# Compile the template once and reuse it for every page. The '<>' trim mode
# drops the newline after lines that start with <% and end with %>.
# NOTE(review): the positional (nil, '<>') form is kept for older Rubies;
# newer versions prefer `trim_mode: '<>'` — confirm the target Ruby before changing.
renderer = ERB.new(File.read(File.join(script_dir, 'coop.erb.md')), nil, '<>')

source_pages.each do |page|
  coop = Coop.new(File.read(File.join(mirror_dir, page)))
  # Swap only the trailing extension, not every ".html" occurrence in the name.
  output_fn = File.join(output_dir, "#{File.basename(page, '.html')}.md")
  File.write(output_fn, renderer.result(coop.erb_binding))
end