PostgreSQL取代HTML实体功能
发布时间:2020-12-13 16:14:58 所属栏目:百科 来源:网络整理
导读:我找到了 this very interesting function on internet: CREATE OR REPLACE FUNCTION strip_tags(TEXT) RETURNS TEXT AS $$ SELECT regexp_replace(regexp_replace($1,E'(?x)[^]*?(s alt s* = s* (['"]) ([^]*?) 2) [^]*? ',E'3'),E'(?x)( [^]*? )','
我找到了
this very interesting function on internet:
CREATE OR REPLACE FUNCTION strip_tags(TEXT) RETURNS TEXT AS $$ SELECT regexp_replace(regexp_replace($1,E'(?x)<[^>]*?(s alt s* = s* (['"]) ([^>]*?) 2) [^>]*? >',E'3'),E'(?x)(< [^>]*? >)','','g') $$LANGUAGE SQL; 但它不会删除html代码,如:& quot; &安培; NBSP;
是的,可以使用函数将HTML或其他字符实体替换为相应的字符.首先创建一个字符实体表:
create table character_entity( name text primary key,ch char(1) unique ); insert into character_entity (ch,name) values (E'u00C6','AElig'),(E'u00C1','Aacute'),(E'u00C2','Acirc'),(E'u00C0','Agrave'),(E'u0391','Alpha'),(E'u00C5','Aring'),(E'u00C3','Atilde'),(E'u00C4','Auml'),(E'u0392','Beta'),(E'u00C7','Ccedil'),(E'u03A7','Chi'),(E'u2021','Dagger'),(E'u0394','Delta'),(E'u00D0','ETH'),(E'u00C9','Eacute'),(E'u00CA','Ecirc'),(E'u00C8','Egrave'),(E'u0395','Epsilon'),(E'u0397','Eta'),(E'u00CB','Euml'),(E'u0393','Gamma'),(E'u00CD','Iacute'),(E'u00CE','Icirc'),(E'u00CC','Igrave'),(E'u0399','Iota'),(E'u00CF','Iuml'),(E'u039A','Kappa'),(E'u039B','Lambda'),(E'u039C','Mu'),(E'u00D1','Ntilde'),(E'u039D','Nu'),(E'u0152','OElig'),(E'u00D3','Oacute'),(E'u00D4','Ocirc'),(E'u00D2','Ograve'),(E'u03A9','Omega'),(E'u039F','Omicron'),(E'u00D8','Oslash'),(E'u00D5','Otilde'),(E'u00D6','Ouml'),(E'u03A6','Phi'),(E'u03A0','Pi'),(E'u2033','Prime'),(E'u03A8','Psi'),(E'u03A1','Rho'),(E'u0160','Scaron'),(E'u03A3','Sigma'),(E'u00DE','THORN'),(E'u03A4','Tau'),(E'u0398','Theta'),(E'u00DA','Uacute'),(E'u00DB','Ucirc'),(E'u00D9','Ugrave'),(E'u03A5','Upsilon'),(E'u00DC','Uuml'),(E'u039E','Xi'),(E'u00DD','Yacute'),(E'u0178','Yuml'),(E'u0396','Zeta'),(E'u00E1','aacute'),(E'u00E2','acirc'),(E'u00B4','acute'),(E'u00E6','aelig'),(E'u00E0','agrave'),(E'u2135','alefsym'),(E'u03B1','alpha'),(E'u0026','amp'),(E'u2227','and'),(E'u2220','ang'),(E'u00E5','aring'),(E'u2248','asymp'),(E'u00E3','atilde'),(E'u00E4','auml'),(E'u201E','bdquo'),(E'u03B2','beta'),(E'u00A6','brvbar'),(E'u2022','bull'),(E'u2229','cap'),(E'u00E7','ccedil'),(E'u00B8','cedil'),(E'u00A2','cent'),(E'u03C7','chi'),(E'u02C6','circ'),(E'u2663','clubs'),(E'u2245','cong'),(E'u00A9','copy'),(E'u21B5','crarr'),(E'u222A','cup'),(E'u00A4','curren'),(E'u21D3','dArr'),(E'u2020','dagger'),(E'u2193','darr'),(E'u00B0','deg'),(E'u03B4','delta'),(E'u2666','diams'),(E'u00F7','divide'),(E'u00E9','eacute'),(E'u00EA','ecirc'),(E'u00E8','egrave'),(E'u2205','empty'),(E'u2003','emsp'),(E'u2002','ensp'),(E'u03B5','epsilon'),(E'u2261','equiv'),(E'u03B7','eta'),(E'u00F0','eth'),(E'u00EB','euml'),(E'u20AC','euro'),(E'u2203','exist'),(E'u0192','fnof'),(E'u2200','forall'),(E'u00BD','frac12'),(E'u00BC','frac14'),(E'u00BE','frac34'),(E'u2044','frasl'),(E'u03B3','gamma'),(E'u2265','ge'),(E'u003E','gt'),(E'u21D4','hArr'),(E'u2194','harr'),(E'u2665','hearts'),(E'u2026','hellip'),(E'u00ED','iacute'),(E'u00EE','icirc'),(E'u00A1','iexcl'),(E'u00EC','igrave'),(E'u2111','image'),(E'u221E','infin'),(E'u222B','int'),(E'u03B9','iota'),(E'u00BF','iquest'),(E'u2208','isin'),(E'u00EF','iuml'),(E'u03BA','kappa'),(E'u21D0','lArr'),(E'u03BB','lambda'),(E'u2329','lang'),(E'u00AB','laquo'),(E'u2190','larr'),(E'u2308','lceil'),(E'u201C','ldquo'),(E'u2264','le'),(E'u230A','lfloor'),(E'u2217','lowast'),(E'u25CA','loz'),(E'u200E','lrm'),(E'u2039','lsaquo'),(E'u2018','lsquo'),(E'u003C','lt'),(E'u00AF','macr'),(E'u2014','mdash'),(E'u00B5','micro'),(E'u00B7','middot'),(E'u2212','minus'),(E'u03BC','mu'),(E'u2207','nabla'),(E'u00A0','nbsp'),(E'u2013','ndash'),(E'u2260','ne'),(E'u220B','ni'),(E'u00AC','not'),(E'u2209','notin'),(E'u2284','nsub'),(E'u00F1','ntilde'),(E'u03BD','nu'),(E'u00F3','oacute'),(E'u00F4','ocirc'),(E'u0153','oelig'),(E'u00F2','ograve'),(E'u203E','oline'),(E'u03C9','omega'),(E'u03BF','omicron'),(E'u2295','oplus'),(E'u2228','or'),(E'u00AA','ordf'),(E'u00BA','ordm'),(E'u00F8','oslash'),(E'u00F5','otilde'),(E'u2297','otimes'),(E'u00F6','ouml'),(E'u00B6','para'),(E'u2202','part'),(E'u2030','permil'),(E'u22A5','perp'),(E'u03C6','phi'),(E'u03C0','pi'),(E'u03D6','piv'),(E'u00B1','plusmn'),(E'u00A3','pound'),(E'u2032','prime'),(E'u220F','prod'),(E'u221D','prop'),(E'u03C8','psi'),(E'u0022','quot'),(E'u21D2','rArr'),(E'u221A','radic'),(E'u232A','rang'),(E'u00BB','raquo'),(E'u2192','rarr'),(E'u2309','rceil'),(E'u201D','rdquo'),(E'u211C','real'),(E'u00AE','reg'),(E'u230B','rfloor'),(E'u03C1','rho'),(E'u200F','rlm'),(E'u203A','rsaquo'),(E'u2019','rsquo'),(E'u201A','sbquo'),(E'u0161','scaron'),(E'u22C5','sdot'),(E'u00A7','sect'),(E'u00AD','shy'),(E'u03C3','sigma'),(E'u03C2','sigmaf'),(E'u223C','sim'),(E'u2660','spades'),(E'u2282','sub'),(E'u2286','sube'),(E'u2211','sum'),(E'u2283','sup'),(E'u00B9','sup1'),(E'u00B2','sup2'),(E'u00B3','sup3'),(E'u2287','supe'),(E'u00DF','szlig'),(E'u03C4','tau'),(E'u2234','there4'),(E'u03B8','theta'),(E'u03D1','thetasym'),(E'u2009','thinsp'),(E'u00FE','thorn'),(E'u02DC','tilde'),(E'u00D7','times'),(E'u2122','trade'),(E'u21D1','uArr'),(E'u00FA','uacute'),(E'u2191','uarr'),(E'u00FB','ucirc'),(E'u00F9','ugrave'),(E'u00A8','uml'),(E'u03D2','upsih'),(E'u03C5','upsilon'),(E'u00FC','uuml'),(E'u2118','weierp'),(E'u03BE','xi'),(E'u00FD','yacute'),(E'u00A5','yen'),(E'u00FF','yuml'),(E'u03B6','zeta'),(E'u200D','zwj'),(E'u200C','zwnj') ; 这是功能: create or replace function entity2char(t text) returns text as $body$ declare r record; begin for r in select distinct ce.ch,ce.name from character_entity ce inner join ( select name[1] "name" from regexp_matches(t,'&([A-Za-z]+?);','g') r(name) ) s on ce.name = s.name loop t := replace(t,'&' || r.name || ';',r.ch); end loop; for r in select distinct hex[1] hex,('x' || repeat('0',8 - length(hex[1])) || hex[1])::bit(32)::int codepoint from regexp_matches(t,'&#x([0-9a-f]{1,8}?);','gi') s(hex) loop t := regexp_replace(t,'&#x' || r.hex || ';',chr(r.codepoint),'gi'); end loop; for r in select distinct chr(codepoint[1]::int) ch,codepoint[1] codepoint from regexp_matches(t,'&#([0-9]{1,10}?);','g') s(codepoint) loop t := replace(t,'&#' || r.codepoint || ';',r.ch); end loop; return t; end; $body$ language plpgsql immutable; 像这样用它: select entity2char('HH■XXXÆYYY×ZZZ■UUU'); entity2char -------------------- HH■XXX?YYY×ZZZ■UUU 它仅适用于UTF-8. (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |