码迷,mamicode.com
首页 > 其他好文 > 详细

温故而知新 原来 cheerio 还可以操作XML

时间:2019-01-06 12:48:14      阅读:192      评论:0      收藏:0      [点我收藏+]

标签:format   adf   har   att   common   alter   load   操作   lang   

以前只以为cheerio是nodejs版本的jQuery,可以轻松操作提取HTML,没想到原来还可以操作XML,记录一下。

 

示例XML: pg132.rdf

技术分享图片
<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xml:base="http://www.gutenberg.org/"
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns:marcrel="http://id.loc.gov/vocabulary/relators/"
  xmlns:dcam="http://purl.org/dc/dcam/"
  xmlns:dcterms="http://purl.org/dc/terms/"
  xmlns:pgterms="http://www.gutenberg.org/2009/pgterms/"
  xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
  xmlns:cc="http://web.resource.org/cc/"
>
  <pgterms:ebook rdf:about="ebooks/132">
    <dcterms:rights>Public domain in the USA.</dcterms:rights>
    <dcterms:subject>
      <rdf:Description rdf:nodeID="N21e9654e1eff482b944fc0247ef57312">
        <rdf:value>Military art and science -- Early works to 1800</rdf:value>
        <dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCSH"/>
      </rdf:Description>
    </dcterms:subject>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/files/132/132.zip">
        <dcterms:format>
          <rdf:Description rdf:nodeID="N2cf8cc38bef142ed8f947a48da48a14d">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain; charset=us-ascii</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2013-02-11T14:23:32</dcterms:modified>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">130802</dcterms:extent>
        <dcterms:format>
          <rdf:Description rdf:nodeID="Nb033c2142f6e4a439dd01d29c001a655">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/zip</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:isFormatOf rdf:resource="ebooks/132"/>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:subject>
      <rdf:Description rdf:nodeID="N44bc14774a114aeabadc0283f87b4e56">
        <dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCC"/>
        <rdf:value>U</rdf:value>
      </rdf:Description>
    </dcterms:subject>
    <pgterms:downloads rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">1656</pgterms:downloads>
    <dcterms:license rdf:resource="license"/>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/132.epub.noimages">
        <dcterms:isFormatOf rdf:resource="ebooks/132"/>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-11-01T01:14:54.714978</dcterms:modified>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">147635</dcterms:extent>
        <dcterms:format>
          <rdf:Description rdf:nodeID="Nacb3971206f0478898a8ffdb4b947876">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/epub+zip</rdf:value>
          </rdf:Description>
        </dcterms:format>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:description>See Project Gutenberg's eBook #17405 for this same text without the translator's annotations, and an HTML version.&#13;
See also Wikipedia: &#13;
http://en.wikipedia.org/wiki/The_Art_of_War</dcterms:description>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/132.kindle.noimages">
        <dcterms:format>
          <rdf:Description rdf:nodeID="N0e3bff89e67d42069145c8d78fe08f71">
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/x-mobipocket-ebook</rdf:value>
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
          </rdf:Description>
        </dcterms:format>
        <dcterms:isFormatOf rdf:resource="ebooks/132"/>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">598683</dcterms:extent>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-11-01T01:14:57.358911</dcterms:modified>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:creator>
      <pgterms:agent rdf:about="2009/agents/4349">
        <pgterms:webpage rdf:resource="http://en.wikipedia.org/wiki/Sun_Tzu"/>
        <pgterms:alias>Sun Tzu</pgterms:alias>
        <pgterms:name>Sunzi, active 6th century B.C.</pgterms:name>
        <pgterms:alias>孫子</pgterms:alias>
        <pgterms:webpage rdf:resource="http://zh.wikipedia.org/wiki/%E5%AD%99%E6%AD%A6"/>
        <pgterms:alias>孙子</pgterms:alias>
      </pgterms:agent>
    </dcterms:creator>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/132.html.images">
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-11-01T01:14:54.344995</dcterms:modified>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N8a514703092046af862ce250b39eeb60">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/html</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:isFormatOf rdf:resource="ebooks/132"/>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">375823</dcterms:extent>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/132.kindle.images">
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-11-01T01:14:56.056948</dcterms:modified>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">598690</dcterms:extent>
        <dcterms:format>
          <rdf:Description rdf:nodeID="Nfebafc55fb90431ca4db3ffd4730e18a">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/x-mobipocket-ebook</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:isFormatOf rdf:resource="ebooks/132"/>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:publisher>Project Gutenberg</dcterms:publisher>
    <dcterms:title>The Art of War</dcterms:title>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/132.txt.utf-8">
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">343691</dcterms:extent>
        <dcterms:isFormatOf rdf:resource="ebooks/132"/>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-11-01T01:14:54.006974</dcterms:modified>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N7712a1bfd70b48a8bee0d53b765004b3">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain</rdf:value>
          </rdf:Description>
        </dcterms:format>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:subject>
      <rdf:Description rdf:nodeID="N865943d48a634d6daad985acf75df370">
        <rdf:value>War -- Early works to 1800</rdf:value>
        <dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCSH"/>
      </rdf:Description>
    </dcterms:subject>
    <pgterms:marc260>1910</pgterms:marc260>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/132.epub.images">
        <dcterms:format>
          <rdf:Description rdf:nodeID="Nd0fe36a450c04c35a9e5f11e267bb999">
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/epub+zip</rdf:value>
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
          </rdf:Description>
        </dcterms:format>
        <dcterms:isFormatOf rdf:resource="ebooks/132"/>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">147634</dcterms:extent>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-11-01T01:14:54.597966</dcterms:modified>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:language>
      <rdf:Description rdf:nodeID="Nf6211a7cd8894a9f8a969f1ac52a6f97">
        <rdf:value rdf:datatype="http://purl.org/dc/terms/RFC4646">en</rdf:value>
      </rdf:Description>
    </dcterms:language>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/files/132/132.txt">
        <dcterms:isFormatOf rdf:resource="ebooks/132"/>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2013-02-11T14:22:50</dcterms:modified>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">343688</dcterms:extent>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N94c59bae51014e438a7f67370c6932d5">
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain; charset=us-ascii</rdf:value>
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
          </rdf:Description>
        </dcterms:format>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:type>
      <rdf:Description rdf:nodeID="Nea290286b7e448b39f448f69cc984d4b">
        <dcam:memberOf rdf:resource="http://purl.org/dc/terms/DCMIType"/>
        <rdf:value>Text</rdf:value>
      </rdf:Description>
    </dcterms:type>
    <marcrel:trl>
      <pgterms:agent rdf:about="2009/agents/5101">
        <pgterms:name>Giles, Lionel</pgterms:name>
        <pgterms:deathdate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">1958</pgterms:deathdate>
        <pgterms:birthdate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">1875</pgterms:birthdate>
        <pgterms:webpage rdf:resource="http://en.wikipedia.org/wiki/Lionel_Giles"/>
      </pgterms:agent>
    </marcrel:trl>
    <dcterms:alternative>Sunzi bing fa. English</dcterms:alternative>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/132.rdf">
        <dcterms:isFormatOf rdf:resource="ebooks/132"/>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">12393</dcterms:extent>
        <dcterms:format>
          <rdf:Description rdf:nodeID="N743d14bfd57f4f96b8f4a6fd4917c2eb">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/rdf+xml</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-12-27T04:59:45.761871</dcterms:modified>
      </pgterms:file>
    </dcterms:hasFormat>
    <dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">1994-05-01</dcterms:issued>
    <dcterms:hasFormat>
      <pgterms:file rdf:about="http://www.gutenberg.org/ebooks/132.html.noimages">
        <dcterms:isFormatOf rdf:resource="ebooks/132"/>
        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-11-01T01:14:54.423973</dcterms:modified>
        <dcterms:format>
          <rdf:Description rdf:nodeID="Nc300c5ba0de44732a56d056f32802d69">
            <dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
            <rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/html</rdf:value>
          </rdf:Description>
        </dcterms:format>
        <dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">375823</dcterms:extent>
      </pgterms:file>
    </dcterms:hasFormat>
    <pgterms:marc546>Translated from the Chinese by Lionel Giles, M.A. (1910)</pgterms:marc546>
  </pgterms:ebook>
  <cc:Work rdf:about="">
    <rdfs:comment>Archives containing the RDF files for *all* our books can be downloaded at
            http://www.gutenberg.org/wiki/Gutenberg:Feeds#The_Complete_Project_Gutenberg_Catalog</rdfs:comment>
    <cc:license rdf:resource="https://creativecommons.org/publicdomain/zero/1.0/"/>
  </cc:Work>
  <rdf:Description rdf:about="http://en.wikipedia.org/wiki/Lionel_Giles">
    <dcterms:description>en.wikipedia</dcterms:description>
  </rdf:Description>
  <rdf:Description rdf:about="http://zh.wikipedia.org/wiki/%E5%AD%99%E6%AD%A6">
    <dcterms:description>zh.wikipedia</dcterms:description>
  </rdf:Description>
  <rdf:Description rdf:about="http://en.wikipedia.org/wiki/Sun_Tzu">
    <dcterms:description>en.wikipedia</dcterms:description>
  </rdf:Description>
</rdf:RDF>
View Code

 

index.js

'use strict';

// Reads the sample Project Gutenberg RDF/XML record (pg132.rdf) that sits next
// to this script and pulls a couple of fields out of it with cheerio.
const cheerio = require('cheerio');
const fs = require('fs');

const rdf = fs.readFileSync(`${__dirname}/pg132.rdf`);

// The input is XML, not HTML: xmlMode tells cheerio to parse it as XML so that
// self-closing elements (e.g. <dcam:memberOf .../>) and tag-name case are kept
// as written instead of being run through HTML parsing rules.
const $ = cheerio.load(rdf, { xmlMode: true });
const book = {};

// Locate <pgterms:ebook rdf:about="ebooks/132">, strip the "ebooks/" prefix and
// convert the remaining "132" to a number. The colon in a namespaced tag name
// has to be escaped in the selector, otherwise it is read as a pseudo-class.
book.id = Number($('pgterms\\:ebook').attr('rdf:about').replace('ebooks/', ''));
book.title = $('dcterms\\:title').text();

 

温故而知新 原来 cheerio 还可以操作XML

标签:format   adf   har   att   common   alter   load   操作   lang   

原文地址:https://www.cnblogs.com/CyLee/p/10227864.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!