출처1 : http://acidluna.tistory.com/224

출처2 : http://blog.naver.com/csaiur/10083194327


참고할 만한 사이트:

https://beomy.tistory.com/21



2014/11/07 - [프로그램 자료/Java Script] - 정규식을 이용한 HTML Tag 삭제


2015/04/09 - [프로그램 자료/Visual C#] - 여러줄을 한번에 StringBuilder로 감싸는 방법~! - Editplus 정규표현식, 바꾸기 기능


2015/02/02 - [프로그램 자료/ASP.NET] - asp.net 에서 URL Query 가져오고, 값 변경 하기(정규식 사용) parameter




이 정규식을 맨 처음 만드신 분 같은데, 

깃허브에 소스가 있네요???


nuget도 지원하구요..


https://github.com/mganss/HtmlSanitizer




누군가 또 java용으로 만들어 둔 것도 있어요...

다만 답변에도 있듯이 미리 삭제할 태그들을 준비해두어야 합니다.

https://stackoverflow.com/questions/4075742/regex-to-strip-html-tags#answer-20951227


form, FORM, ForM 등 대소문자를 섞어 쓰는 패턴 때문에 약간 변경했다.

attributes는 내가 추가하긴 했는데, 정상 동작하는지는 직접 확인하고 쓰시길 바란다.





JDK 1.8

/**
 * Regex-based HTML clean-up helper.
 *
 * <p>Removes the opening/closing markup of a fixed set of tags and deletes a
 * fixed set of attributes (event handlers and {@code href}) from the remaining
 * markup. Matching is case-insensitive, so {@code form}, {@code FORM} and
 * {@code ForM} are all handled.
 *
 * <p>NOTE(review): this works on text patterns, not on a parsed document, so it
 * should not be the only defence against hostile HTML; prefer a parser-based
 * sanitizer for untrusted input.
 */
public class HtmlSanitizer {
    /** Tag names whose start and end tags are removed (the content between them is kept). */
    private static final String[] tagsTab = { "form", "script", "body", "iframe", "object" };

    /** Attribute names that are removed together with their value. */
    private static final String[] attrsTab = {
            "FSCommand", "onAbort", "onActivate", "onAfterPrint",
            "onAfterUpdate", "onBeforeActivate", "onBeforeCopy", "onBeforeCut", "onBeforeDeactivate",
            "onBeforeEditFocus", "onBeforePaste", "onBeforePrint", "onBeforeUnload", "onBeforeUpdate",
            "onBegin", "onBlur", "onBounce", "onCellChange", "onChange", "onClick", "onContextMenu",
            "onControlSelect", "onCopy", "onCut", "onDataAvailable", "onDataSetChanged", "onDataSetComplete",
            "onDblClick", "onDeactivate", "onDrag", "onDragEnd", "onDragLeave", "onDragEnter",
            "onDragOver", "onDragDrop", "onDragStart", "onDrop", "onEnd", "onError", "onErrorUpdate",
            "onFilterChange", "onFinish", "onFocus", "onFocusIn", "onFocusOut", "onHashChange", "onHelp",
            "onInput", "onKeyDown", "onKeyPress", "onKeyUp", "onLayoutComplete", "onLoad", "onLoseCapture",
            "onMediaComplete", "onMediaError", "onMessage", "onMouseDown", "onMouseEnter",
            "onMouseLeave", "onMouseMove", "onMouseOut", "onMouseOver", "onMouseUp", "onMouseWheel", "onMove",
            "onMoveEnd", "onMoveStart", "onOffline", "onOnline", "onOutOfSync", "onPaste", "onPause",
            "onPopState", "onProgress", "onPropertyChange", "onReadyStateChange", "onRedo", "onRepeat",
            "onReset", "onResize", "onResizeEnd", "onResizeStart", "onResume", "onReverse", "onRowsEnter",
            "onRowExit", "onRowDelete", "onRowInserted", "onScroll", "onSeek", "onSelect", "onSelectionChange",
            "onSelectStart", "onStart", "onStop", "onStorage", "onSyncRestored", "onSubmit",
            "onTimeError", "onTrackChange", "onUndo", "onUnload", "onURLFlip", "seekSegmentTime", "href" };

    /**
     * Matches a start or end tag of any listed element.
     * The word boundary keeps look-alike names such as {@code <formula>} intact, and
     * {@code [^>]*} (unlike {@code .}) also spans line breaks inside the tag.
     * The names are joined as-is: the {@code (?i)} flag already makes matching
     * case-insensitive, and avoiding {@code toLowerCase()} avoids locale-specific
     * case mapping (e.g. the Turkish dotless i).
     */
    private static final java.util.regex.Pattern tagsPattern = java.util.regex.Pattern.compile(
            "(?i)</?(?:" + String.join("|", tagsTab) + ")\\b[^>]*>");

    /**
     * Matches one listed attribute including its leading whitespace and its value.
     * The value is either double-quoted, single-quoted, or an unquoted run of
     * non-space characters; quoted values are bounded by the end of the tag so
     * an unterminated quote cannot swallow following markup, and the match stops
     * at the closing quote so later attributes of the same tag are preserved.
     */
    private static final java.util.regex.Pattern attrsPattern = java.util.regex.Pattern.compile(
            "(?i)\\s(?:" + String.join("|", attrsTab) + ")\\s*=\\s*(?:\"[^\">]*\"|'[^'>]*'|[^\\s>]*)");

    /**
     * Removes the listed tags and attributes from {@code input}.
     *
     * @param input HTML text; may be {@code null}
     * @return the cleaned text, or {@code null} when {@code input} is {@code null}
     */
    public static String sanitize(String input) {
        if (input == null) {
            return null;
        }
        // Tags first, so attributes inside removed tags never need a second look.
        String withoutTags = tagsPattern.matcher(input).replaceAll("");
        return attrsPattern.matcher(withoutTags).replaceAll("");
    }
}









HTMLTags.cs






using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using System.Text.RegularExpressions;

 

 

/*

 * 출처 :

 * http://acidluna.tistory.com/224

 * http://blog.naver.com/csaiur/10083194327

 */

/// <summary>
/// Regex-based helpers for stripping HTML tags, extracting link targets and
/// escaping or unescaping HTML special characters.
/// </summary>
/// <remarks>
/// All helpers operate on text patterns, not on a parsed document.
/// NOTE(review): do not rely on them as the only defence against hostile HTML.
/// </remarks>
class HTMLTags
{
    // Entity/replacement pairs applied in array order by ReplaceSpecialString.
    // "&amp;" is deliberately last: decoding it earlier would turn "&amp;quot;"
    // into "&quot;" and then into a quote, i.e. decode the text twice.
    private static readonly string[][] SpecialStrings =
    {
        new[] {"&nbsp;", " "},
        new[] {"&lt;", "<"},
        new[] {"&gt;", ">"},
        new[] {"&quot;", "\""},
        new[] {"&lsquo;", "'"},
        new[] {"&rsquo;", "'"},
        new[] {"&middot;", "·"},
        new[] {"&#8228;", "·"},
        new[] {"&amp;", "&"}
    };

    /// <summary>
    /// Removes every HTML tag.
    /// </summary>
    /// <param name="str">HTML</param>
    /// <returns>The text with all <c>&lt;...&gt;</c> spans removed; null or empty input is returned unchanged.</returns>
    public static string StripHTMLTags(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return str;
        }

        // "[^>]*" matches the same spans as the lazy "(.|\n)*?" without the
        // per-character alternation.
        return Regex.Replace(str, @"<[^>]*>", String.Empty);
    }

    /// <summary>
    /// Removes every HTML tag except the allowed ones.
    /// </summary>
    /// <param name="str">HTML</param>
    /// <param name="strAllowTag">Comma-separated tag names to keep, ex) b,span,div (matched case-insensitively)</param>
    /// <param name="option">Any combination of: h (DOCTYPE and title), c (comments), s (style attributes), r (decode special strings)</param>
    /// <returns>The stripped text; null or empty input is returned unchanged.</returns>
    public static string StripHTMLTags(string str, string strAllowTag, string option)
    {
        if (string.IsNullOrEmpty(str))
        {
            return str;
        }

        string flags = (option ?? string.Empty).ToLowerInvariant();

        // Remove DOCTYPE and title.
        if (flags.IndexOf('h') >= 0)
        {
            str = RemoveHeaderTitle(str);
        }

        // Remove comments.
        if (flags.IndexOf('c') >= 0)
        {
            str = RemoveComment(str);
        }

        // Remove style attributes. The value must end with the same quote it
        // started with, so a value such as "it's" is removed as a whole.
        if (flags.IndexOf('s') >= 0)
        {
            str = Regex.Replace(str, @"\bstyle\s*=\s*(?:""[^""]*""|'[^']*')", string.Empty, RegexOptions.IgnoreCase);
        }

        // A tag is kept when its name is an allowed name followed by whitespace,
        // '/' or '>' (so allowing "b" does not also allow "br").
        string allowed = BuildTagAlternation(strAllowTag);
        string keepGuard = allowed.Length == 0 ? string.Empty : @"(?!(?:" + allowed + @")[\s/>])";

        // Everything between the tag name and the next '>' is treated as
        // attributes; a single "[^>]*" keeps matching linear-time.
        string stringPattern = @"</?" + keepGuard + @"[a-zA-Z0-9]+(?:\s[^>]*)?/?>";

        string removeTag = Regex.Replace(str, stringPattern, string.Empty, RegexOptions.IgnoreCase);

        if (flags.IndexOf('r') >= 0)
        {
            removeTag = ReplaceSpecialString(removeTag);
        }

        return removeTag;
    }

    /// <summary>
    /// Gets the link target of the first href attribute.
    /// </summary>
    /// <param name="aTag">An a tag containing an href attribute</param>
    /// <returns>The quoted href value, or an empty string when none is found.</returns>
    public static string GetAtagLink(string aTag)
    {
        if (string.IsNullOrEmpty(aTag))
        {
            return string.Empty;
        }

        // Group 1 holds a double-quoted value, group 2 a single-quoted one.
        var match = Regex.Match(aTag, @"\bhref\s*=\s*(?:""([^""]*)""|'([^']*)')", RegexOptions.IgnoreCase);

        return match.Groups[1].Success ? match.Groups[1].Value : match.Groups[2].Value;
    }

    /// <summary>
    /// Removes the document type declaration and the title element.
    /// </summary>
    /// <param name="str">HTML</param>
    /// <returns>The text without DOCTYPE and title.</returns>
    private static string RemoveHeaderTitle(string str)
    {
        str = Regex.Replace(str, @"<!DOCTYPE[^>]*>", string.Empty, RegexOptions.IgnoreCase);

        // Singleline lets the title text span line breaks; the start tag may
        // carry attributes and the end tag may use different casing.
        str = Regex.Replace(str, @"<title\b[^>]*>.*?</title\s*>", string.Empty,
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        return str;
    }

    /// <summary>
    /// Removes HTML comments (&lt;!-- --&gt;), including multi-line ones.
    /// </summary>
    /// <param name="str">HTML</param>
    /// <returns>The text without comments; null or empty input is returned unchanged.</returns>
    public static string RemoveComment(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return str;
        }

        //http://stackoverflow.com/questions/3524317/regex-to-strip-line-comments-from-c-sharp/3524689#3524689
        return Regex.Replace(str, "<!--.*?-->", string.Empty, RegexOptions.Singleline);
    }

    /// <summary>
    /// HTML-encodes the whole string.
    /// </summary>
    /// <param name="str">HTML</param>
    /// <returns>The encoded text.</returns>
    public static string HTMLEncode(string str)
    {
        // HttpUtility needs no active request, unlike HttpContext.Current.Server,
        // which is null outside of one.
        return System.Web.HttpUtility.HtmlEncode(str);
    }

    /// <summary>
    /// Replaces the angle brackets of every tag with &amp;lt; and &amp;gt;.
    /// </summary>
    /// <param name="str">HTML</param>
    /// <returns>The text with tag brackets escaped; null or empty input is returned unchanged.</returns>
    public static string ReplaceHTMLSpecialChars(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return str;
        }

        return Regex.Replace(str, @"<([^<>]+)?>", "&lt;$1&gt;");
    }

    /// <summary>
    /// Replaces the angle brackets of every tag except the allowed ones with &amp;lt; and &amp;gt;.
    /// </summary>
    /// <param name="str">HTML</param>
    /// <param name="strAllowTag">Comma-separated tag names to leave untouched, ex) b,span,div (matched case-insensitively)</param>
    /// <returns>The text with the brackets of non-allowed tags escaped; null or empty input is returned unchanged.</returns>
    public static string ReplaceHTMLSpecialChars(string str, string strAllowTag)
    {
        if (string.IsNullOrEmpty(str))
        {
            return str;
        }

        string allowed = BuildTagAlternation(strAllowTag);
        string allowedGuard = allowed.Length == 0 ? string.Empty : @"|(?:" + allowed + @")(?=[\s/>])";

        // The "/" alternative inside the lookahead stops the engine from
        // backtracking "(/?)" to empty and then escaping an allowed end tag.
        string pattern = @"<(/?)(?!/" + allowedGuard + @")([^<>]+)?>";

        return Regex.Replace(str, pattern, "&lt;$1$2&gt;", RegexOptions.IgnoreCase);
    }

    /// <summary>
    /// Decodes a small fixed set of HTML entities.
    /// </summary>
    /// <param name="str">String</param>
    /// <returns>The text with the known entities replaced.</returns>
    private static string ReplaceSpecialString(string str)
    {
        foreach (string[] pair in SpecialStrings)
        {
            str = str.Replace(pair[0], pair[1]);
        }

        return str;
    }

    /// <summary>
    /// Turns a comma-separated tag list into a regex alternation of escaped names.
    /// </summary>
    /// <param name="strAllowTag">Comma-separated tag names; may be null or empty</param>
    /// <returns>For example "b|span|div", or an empty string when no name is given.</returns>
    private static string BuildTagAlternation(string strAllowTag)
    {
        if (string.IsNullOrEmpty(strAllowTag))
        {
            return string.Empty;
        }

        StringBuilder buffer = new StringBuilder();
        foreach (string rawTag in strAllowTag.Split(','))
        {
            string tag = rawTag.Trim();
            if (tag.Length == 0)
            {
                // Skip blanks such as the ones produced by "b,,i" or a trailing comma.
                continue;
            }

            if (buffer.Length > 0)
            {
                buffer.Append('|');
            }

            buffer.Append(Regex.Escape(tag));
        }

        return buffer.ToString();
    }
}

 

Posted by motolies
,